In [33]:
import pandas as pd
import re
import random
import pickle
import contractions
import nltk
from nltk.corpus import wordnet
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


In [34]:
df = pd.read_csv("Twitter.csv")
df

Unnamed: 0,clean_text,category,category_sentiment
0,when modi promised “minimum government maximum...,-1,negative
1,talk all the nonsense and continue all the dra...,0,neutral
2,what did just say vote for modi welcome bjp t...,1,positive
3,asking his supporters prefix chowkidar their n...,1,positive
4,answer who among these the most powerful world...,1,positive
...,...,...,...
177981,'I'm not satisfied with The Hills finale. gon...,-1,negative
177982,this sucks,-1,negative
177983,this is bad,-1,negative
177984,I am not okay with this,-1,negative


In [35]:
df["category"].value_counts()

 1    72254
 0    62713
-1    43019
Name: category, dtype: int64

In [36]:
def text_transformation(text):
    text = " ".join(x.lower() for x in str(text).split())                             # Converting Text to Lowercase
    text = contractions.fix(text)                                                     # Fixes Contractions such as ("you're" to "you are" etc.)
    text = " ".join([re.sub("[^A-Za-z]+", "", x) for x in word_tokenize(text)])       # Removal of Punctuation, Numbers, and Special Characters                                                               
    return text

In [37]:
df["processed_text"] = df["clean_text"].apply(text_transformation)
df

Unnamed: 0,clean_text,category,category_sentiment,processed_text
0,when modi promised “minimum government maximum...,-1,negative,when modi promised minimum government maximum...
1,talk all the nonsense and continue all the dra...,0,neutral,talk all the nonsense and continue all the dra...
2,what did just say vote for modi welcome bjp t...,1,positive,what did just say vote for modi welcome bjp to...
3,asking his supporters prefix chowkidar their n...,1,positive,asking his supporters prefix chowkidar their n...
4,answer who among these the most powerful world...,1,positive,answer who among these the most powerful world...
...,...,...,...,...
177981,'I'm not satisfied with The Hills finale. gon...,-1,negative,i am not satisfied with the hills finale goi...
177982,this sucks,-1,negative,this sucks
177983,this is bad,-1,negative,this is bad
177984,I am not okay with this,-1,negative,i am not okay with this


In [38]:
x = df["processed_text"]
y = df["category_sentiment"]
y_strat = df["category"]

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y_strat)

print("Train:", x_train.shape, y_train.shape)
print("Test: ", x_test.shape, y_test.shape)

Train: (142388,) (142388,)
Test:  (35598,) (35598,)


In [40]:
vectorizer = TfidfVectorizer(sublinear_tf=True)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [41]:
def objective_function(x, y, x_validation, y_validation, params):
    model = LinearSVC(
        tol = params["tol"],
        C = params["C"],
        class_weight = "balanced",
        max_iter = params["max_iter"],
        random_state = params["random_state"] 
    )

    model.fit(x, y)

    predictions = model.predict(x_validation)

    return accuracy_score(predictions, y_validation)

In [42]:
num_samples = 20
best_params = None
current_acc = 0.50

for i in range(num_samples):
    params = {
        "tol" : random.uniform(0.0001, 0.00001),
        "C" : random.uniform(0.1, 1),
        "max_iter" : random.randrange(100, 10000, 100),
        "random_state" : random.randrange(1, 10, 1)
    }

    acc = objective_function(x_train, y_train, x_test, y_test, params)
    print("Accuracy: {}".format(acc))

    if acc > current_acc:
        best_params = params
        current_acc = acc

print("Best Accuracy: {}".format(current_acc))
print("Best Params: {}".format(best_params))

Accuracy: 0.9265969998314512
Accuracy: 0.9154727793696275
Accuracy: 0.9278892072588347
Accuracy: 0.9128883645148604
Accuracy: 0.926344176639137
Accuracy: 0.9199393224338446
Accuracy: 0.9242092252373729
Accuracy: 0.9270745547502669
Accuracy: 0.9266812742288892
Accuracy: 0.9205854261475364
Accuracy: 0.9286195853699646
Accuracy: 0.9285072195067139
Accuracy: 0.9286195853699646
Accuracy: 0.927158829147705
Accuracy: 0.9107815045789089
Accuracy: 0.928591493904152
Accuracy: 0.9278049328613968
Accuracy: 0.9267374571605146
Accuracy: 0.9196864992415305
Accuracy: 0.9075790774762628
Best Accuracy: 0.9286195853699646
Best Params: {'tol': 5.854598114993132e-05, 'C': 0.827314960045656, 'max_iter': 7100, 'random_state': 7}


In [43]:
model = LinearSVC(
    tol = best_params["tol"],
    C = best_params["C"],
    class_weight = "balanced",
    max_iter = best_params["max_iter"],
    random_state = best_params["random_state"] 
)

model.fit(x_train, y_train)

predictions = model.predict(x_test)

print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

    negative       0.87      0.89      0.88      8464
     neutral       0.96      0.94      0.95     12770
    positive       0.94      0.94      0.94     14364

    accuracy                           0.93     35598
   macro avg       0.92      0.92      0.92     35598
weighted avg       0.93      0.93      0.93     35598

0.9286195853699646


In [44]:
pickl = {"vectorizer": vectorizer,
         "model": model
         }
pickle.dump(pickl, open('ml_model'+".p", "wb"))