In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Load the labelled reviews from the tab-separated training file.
training_set = pd.read_csv(
    '../dataset/labeledTrainData.tsv',
    sep='\t',
)
# Preview the first two rows as the cell output.
training_set.head(2)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."


In [4]:
# Hold out 25% of the labelled reviews for evaluation.
# `x_train` / `y_train` hold the full review and sentiment columns;
# the actual partitions are the four names returned by the split.
x_train = training_set['review'].values
y_train = training_set['sentiment'].values

train_data, test_data, train_labels, test_labels = train_test_split(
    x_train,
    y_train,
    test_size=0.25,
    shuffle=True,
    stratify=y_train,   # stratify the split on the sentiment labels
    random_state=42,    # fixed seed for a reproducible split
)

In [5]:
# Vectorization: convert the review text into numeric TF-IDF features.
# LogisticRegression works on numbers, so the words must be vectorized first.

# Word 1- to 3-grams; min_df=10 and max_df=0.95 bound the document
# frequency of the terms that are kept.
tf = TfidfVectorizer(min_df=10, max_df=0.95, use_idf=True, ngram_range=(1, 3))

# Fit on the training split only and keep the matrix that fit_transform
# returns, rather than discarding it and transforming the same data again.
train_feature_set = tf.fit_transform(train_data)
# The test split is only transformed with the already-fitted vectorizer.
test_feature_set = tf.transform(test_data)

# Number of feature columns, shown as the cell output.
train_feature_set.shape[1]

90185

In [6]:
# Distinct label values present in the training labels.
classes = np.unique(ar=train_labels)
print('classes: ', classes)

classes:  [0 1]


In [7]:
# Logistic Regression: train on the TF-IDF features, then evaluate on the
# held-out test split.
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Training phase.
# Classes are 0 and 1 only.
tf_model = LogisticRegression(penalty='l2',C=1,solver = 'liblinear', random_state = 42, max_iter=1000)
print ("------------------Training In Progress------------------------")
print ("Training Examples: ",train_data.shape)
tf_model.fit(train_feature_set,train_labels)
print ('------------------------Training Completed!')

# Testing phase.
y_pred = tf_model.predict(test_feature_set)
# Store the metrics under their own names: assigning the result to
# `accuracy_score` would rebind the function imported above to a float and
# make any later call to it fail.
test_accuracy = round(metrics.accuracy_score(test_labels, y_pred), 3)
test_f1 = round(metrics.f1_score(test_labels, y_pred), 3)
print("Accuracy: ",test_accuracy)
print("F1: ",test_f1)


------------------Training In Progress------------------------
Training Examples:  (18750,)
------------------------Training Completed!
Accuracy:  0.891
F1:  0.892


In [9]:
# Load the test reviews and show the first one as the cell output.
test = pd.read_csv(
    '../dataset/testData.tsv',
    sep='\t',
)
Xtest = test['review'].values
Xtest[0]

"Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty."

In [10]:
# Predict the sentiment of the first two test reviews and print each
# review next to its predicted label.
test_array = Xtest[0:2]
test_review = tf.transform(test_array)
# Use dedicated names for the predictions and the loop variable. Reusing
# `classes` would overwrite the unique-label array computed earlier and
# leave it bound to a single prediction once the loop ends.
predicted_labels = tf_model.predict(test_review)
for sentence, predicted_label in zip(test_array, predicted_labels):
    print(sentence,':','pos' if predicted_label ==1 else 'neg')

Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty. : pos
This movie is a disaster within a disaster film. It is full of great action scenes, which are only meaningful if you throw away all sense of reality. Let's see, word to the wise, lava burns you; steam burns you. You can't st

# save model

In [13]:
# Persist the fitted vectorizer and the trained model to disk.
# `with` closes (and flushes) each file even if pickling raises; a bare
# open(...) passed straight to pickle.dump leaves the handle open.

# pickling the vectorizer
with open('tf_vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(tf, vectorizer_file)
# pickling the model
with open('tf_model.sav', 'wb') as model_file:
    pickle.dump(tf_model, model_file)

# load model

In [14]:
# Restore the trained model and the fitted vectorizer from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files that
# were produced by a trusted source (here: the save cell of this notebook).
# `with` makes sure each file handle is closed after loading.
with open('tf_model.sav', 'rb') as model_file:
    tf_model = pickle.load(model_file)
with open('tf_vectorizer.sav', 'rb') as vectorizer_file:
    tf_vectorizer = pickle.load(vectorizer_file)

In [15]:
# Classify a single piece of text typed in by the user, using the
# reloaded vectorizer and model.
x = input('Enter a text: ')
x_transform = tf_vectorizer.transform([x])

# predict() returns one label per input row; only one row was supplied.
predicted_label = tf_model.predict(x_transform)[0]
if predicted_label == 1:
    sentiment = 'pos'
else:
    sentiment = 'neg'
print(x, ':', sentiment)

nice dog : pos
