In [1]:
import pandas as pd
import re
import random
import pickle
import contractions

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [2]:
df = pd.read_csv("Twitter.csv")
df

Unnamed: 0,clean_text,category,category_sentiment
0,i am not happy,-1,negative
1,i am not sad,1,positive
2,i'm fine,0,neutral
3,when modi promised “minimum government maximum...,-1,negative
4,talk all the nonsense and continue all the dra...,0,neutral
...,...,...,...
177971,'I'm not satisfied with The Hills finale. gon...,-1,negative
177972,this sucks,-1,negative
177973,this is bad,-1,negative
177974,I am not okay with this,-1,negative


In [3]:
df["category"].value_counts()

 1    72250
 0    62712
-1    43014
Name: category, dtype: int64

In [4]:
def text_transformation(text):
    text = " ".join(x.lower() for x in str(text).split())                             # Converting Text to Lowercase
    text = contractions.fix(text)                                                     # Fixes Contractions such as ("you're" to "you are" etc.)
    text = " ".join([re.sub("[^A-Za-z]+", "", x) for x in word_tokenize(text)])       # Removal of Punctuation, Numbers, and Special Characters                                                                   
    return text

In [5]:
df["processed_text"] = df["clean_text"].apply(text_transformation)
df

Unnamed: 0,clean_text,category,category_sentiment,processed_text
0,i am not happy,-1,negative,i am not happy
1,i am not sad,1,positive,i am not sad
2,i'm fine,0,neutral,i am fine
3,when modi promised “minimum government maximum...,-1,negative,when modi promised minimum government maximum...
4,talk all the nonsense and continue all the dra...,0,neutral,talk all the nonsense and continue all the dra...
...,...,...,...,...
177971,'I'm not satisfied with The Hills finale. gon...,-1,negative,i am not satisfied with the hills finale goi...
177972,this sucks,-1,negative,this sucks
177973,this is bad,-1,negative,this is bad
177974,I am not okay with this,-1,negative,i am not okay with this


In [6]:
x = df["processed_text"]
y = df["category_sentiment"]
y_strat = df["category"]

vectorizer = TfidfVectorizer(sublinear_tf = True)
x_vectorized = vectorizer.fit_transform(x)

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_vectorized, y, test_size=0.2, random_state=42, stratify=y_strat)

print("Train:", x_train.shape, y_train.shape)
print("Test: ", x_test.shape, y_test.shape)

Train: (142380, 112550) (142380,)
Test:  (35596, 112550) (35596,)


In [8]:
def objective_function(x, y, x_validation, y_validation, params):
    model = LogisticRegression(
        tol = params['tol'],
        C = params['C'],
        class_weight = 'balanced',
        solver = 'lbfgs',
        max_iter = params['max_iter'],
        multi_class = 'multinomial',
    )

    model.fit(x, y)

    predictions = model.predict(x_validation)

    return accuracy_score(predictions, y_validation)

In [9]:
num_samples = 15
best_params = None
current_acc = 0.50

for i in range(num_samples):
    params = {
        'tol' : random.uniform(0.0001, 0.00001),
        'C' : random.uniform(0.1, 1),
        'max_iter' : random.randrange(100, 10000, 100),
    }

    acc = objective_function(x_train, y_train, x_test, y_test, params)
    print("Accuracy: {}".format(acc))

    if acc > current_acc:
        best_params = params
        current_acc = acc

print("Best Accuracy: {}".format(current_acc))
print("Best Params: {}".format(best_params))

Accuracy: 0.9045117428924598
Accuracy: 0.9049331385548938
Accuracy: 0.8952691313630745
Accuracy: 0.8846780537139004
Accuracy: 0.8990055062366558
Accuracy: 0.8948196426564783
Accuracy: 0.9033318350376447
Accuracy: 0.9035284863467805
Accuracy: 0.8895100573098101
Accuracy: 0.9050736037757051
Accuracy: 0.890774244297112
Accuracy: 0.8951848522305877
Accuracy: 0.9050455107315428
Accuracy: 0.8995673671199012
Accuracy: 0.8991178784133048
Best Accuracy: 0.9050736037757051
Best Params: {'tol': 1.5000359845651024e-05, 'C': 0.9973358517896193, 'max_iter': 8500}


In [10]:
model = LogisticRegression(
        tol = best_params['tol'],
        C = best_params['C'],
        class_weight = 'balanced',
        solver = 'lbfgs',
        max_iter = best_params['max_iter'],
        multi_class = 'multinomial',
    )

model.fit(x_train, y_train)

predictions = model.predict(x_test)

print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

    negative       0.86      0.85      0.86      8717
     neutral       0.95      0.90      0.93     13300
    positive       0.89      0.95      0.92     13579

    accuracy                           0.91     35596
   macro avg       0.90      0.90      0.90     35596
weighted avg       0.91      0.91      0.91     35596

0.9050736037757051


In [24]:
pickl = {'vectorizer': vectorizer,
         'model': model
         }
pickle.dump(pickl, open('ml_model'+".p", "wb"))

In [12]:
test_feature = vectorizer.transform(["You have a fast and awesome response :)"])
model.predict(test_feature)

array(['positive'], dtype=object)

In [13]:
test_feature = vectorizer.transform(["This food is disgusting! I will not eat something like this!"])
model.predict(test_feature)

array(['negative'], dtype=object)

In [14]:
test_feature = vectorizer.transform(["The product is very good."])
model.predict(test_feature)

array(['positive'], dtype=object)

In [15]:
test_feature = vectorizer.transform(["I hate the service. The queue is very long."])
model.predict(test_feature)

array(['negative'], dtype=object)

In [16]:
test_feature = vectorizer.transform(["This place is uninfluenced by fashion or trends."])
model.predict(test_feature)

array(['neutral'], dtype=object)

In [17]:
test_feature = vectorizer.transform(["This is an apple."])
model.predict(test_feature)

array(['neutral'], dtype=object)

In [18]:
test_feature = vectorizer.transform(["How are you?"])
model.predict(test_feature)

array(['neutral'], dtype=object)

In [19]:
test_feature = vectorizer.transform(["This sucks!"])
model.predict(test_feature)

array(['negative'], dtype=object)

In [20]:
test_feature = vectorizer.transform(["She is disinterested in everything."])
model.predict(test_feature)

array(['neutral'], dtype=object)

In [21]:
test_feature = vectorizer.transform(["The item is working as intended. Great!"])
model.predict(test_feature)

array(['positive'], dtype=object)

In [22]:
test_feature = vectorizer.transform(["My phone is not working! The touch response is really slow."])
model.predict(test_feature)

array(['negative'], dtype=object)

In [23]:
test_feature = vectorizer.transform(["I am delighted and happy"])
model.predict(test_feature)

array(['positive'], dtype=object)