In [1]:
import pandas as pd
import re
import random
import pickle
import contractions
import nltk
from nltk.corpus import wordnet
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


In [2]:
df = pd.read_csv("Twitter.csv")
df

Unnamed: 0,clean_text,category,category_sentiment
0,when modi promised “minimum government maximum...,-1,negative
1,talk all the nonsense and continue all the dra...,0,neutral
2,what did just say vote for modi welcome bjp t...,1,positive
3,asking his supporters prefix chowkidar their n...,1,positive
4,answer who among these the most powerful world...,1,positive
...,...,...,...
177981,'I'm not satisfied with The Hills finale. gon...,-1,negative
177982,this sucks,-1,negative
177983,this is bad,-1,negative
177984,I am not okay with this,-1,negative


In [3]:
df["category"].value_counts()

 1    72254
 0    62713
-1    43019
Name: category, dtype: int64

In [4]:
def text_transformation(text):
    text = " ".join(x.lower() for x in str(text).split())                             # Converting Text to Lowercase
    text = contractions.fix(text)                                                     # Fixes Contractions such as ("you're" to "you are" etc.)
    text = " ".join([re.sub("[^A-Za-z]+", "", x) for x in word_tokenize(text)])       # Removal of Punctuation, Numbers, and Special Characters                                                                  
    return text

In [5]:
df["processed_text"] = df["clean_text"].apply(text_transformation)
df

In [None]:
x = df["processed_text"]
y = df["category_sentiment"]
y_strat = df["category"]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y_strat)

print("Train:", x_train.shape, y_train.shape)
print("Test: ", x_test.shape, y_test.shape)

Train: (142388,) (142388,)
Test:  (35598,) (35598,)


In [None]:
vectorizer = TfidfVectorizer(sublinear_tf=True)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [None]:
def objective_function(x, y, x_validation, y_validation, params):
    model = LinearSVC(
        tol = params["tol"],
        C = params["C"],
        max_iter = params["max_iter"],
        random_state = params["random_state"] 
    )

    model.fit(x, y)

    predictions = model.predict(x_validation)

    return accuracy_score(predictions, y_validation)

In [None]:
num_samples = 20
best_params = None
current_acc = 0.50

for i in range(num_samples):
    params = {
        "tol" : random.uniform(0.0001, 0.00001),
        "C" : random.uniform(0.1, 1),
        "max_iter" : random.randrange(100, 10000, 100),
        "random_state" : random.randrange(1, 10, 1)
    }

    acc = objective_function(x_train, y_train, x_test, y_test, params)
    print("Accuracy: {}".format(acc))

    if acc > current_acc:
        best_params = params
        current_acc = acc

print("Best Accuracy: {}".format(current_acc))
print("Best Params: {}".format(best_params))

Accuracy: 0.9338445980111242
Accuracy: 0.9318781954042362
Accuracy: 0.929209506152031
Accuracy: 0.9326647564469914
Accuracy: 0.933142311365807
Accuracy: 0.9214000786561043
Accuracy: 0.9333670430923086
Accuracy: 0.9348839822461936
Accuracy: 0.9297432440024721
Accuracy: 0.9215967189167931
Accuracy: 0.9330861284341817
Accuracy: 0.9336479577504354
Accuracy: 0.9348558907803809
Accuracy: 0.9346311590538794
Accuracy: 0.9346873419855048
Accuracy: 0.9321591100623631
Accuracy: 0.9317658295409854
Accuracy: 0.9283948536434631
Accuracy: 0.9339850553401876
Accuracy: 0.9211753469296028
Best Accuracy: 0.9348839822461936
Best Params: {'tol': 7.52575310080993e-05, 'C': 0.9794240140812909, 'max_iter': 6400, 'random_state': 7}


In [None]:
model = LinearSVC(
    tol = best_params["tol"],
    C = best_params["C"],
    max_iter = best_params["max_iter"],
    random_state = best_params["random_state"] 
)

model.fit(x_train, y_train)

predictions = model.predict(x_test)

print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

    negative       0.87      0.91      0.89      8209
     neutral       0.96      0.94      0.95     12837
    positive       0.95      0.95      0.95     14552

    accuracy                           0.93     35598
   macro avg       0.93      0.93      0.93     35598
weighted avg       0.94      0.93      0.94     35598

0.9348839822461936
