In [1]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Read the tab-separated tweet file. header=None makes pandas treat the first
# line as data, so the two column names are supplied explicitly.
sentiment_data = pd.read_csv(
    'Dataset/tweet/train_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)
# Eyeball a random handful of rows.
sentiment_data.sample(10)

Unnamed: 0,Label,Text
6583,0,Brokeback Mountain was boring.
535,1,DA VINCI CODE IS AWESOME!!
3474,1,I love Brokeback Mountain....
3721,1,"Anyway, thats why I love "" Brokeback Mountain."
1392,1,Mission Impossible 3 was excellent.
2517,1,I love Harry Potter..
5453,0,"Not because I hate Harry Potter, but because I..."
269,1,Love luv lubb the Da Vinci Code!
6142,0,"Oh, and Brokeback Mountain is a TERRIBLE movie..."
5285,0,These Harry Potter movies really suck.


In [3]:
# Inputs are the raw texts; targets are the 0/1 labels.
X=sentiment_data['Text']
Y=sentiment_data['Label']
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts.
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [4]:
# Number of training samples produced by the split.
x_train.shape

(5534,)

In [5]:
# TF-IDF features restricted to a 15-term vocabulary (max_features=15).
# The vocabulary and IDF weights are learned from the training texts only.
tfidf_vect = TfidfVectorizer(max_features=15)
x_trans = tfidf_vect.fit_transform(x_train)

In [6]:
# Print the sparse entries (row, column, weight) of the first two training documents.
print(x_trans[:2])

  (0, 1)	0.476992830891
  (0, 14)	0.471745128925
  (0, 3)	0.38086960393
  (0, 13)	0.380977170352
  (0, 4)	0.380977170352
  (0, 12)	0.338504813491
  (1, 7)	0.476559234752
  (1, 0)	0.451966200606
  (1, 11)	0.422209464007
  (1, 5)	0.422209464007
  (1, 8)	0.460539017118


In [7]:
# Linear SVM trained on the TF-IDF features. fit() returns the estimator itself,
# so `linear_svc_model` and `classifier` refer to the same fitted object.
classifier = LinearSVC(
    C=1.0,
    max_iter=1000,
    tol=1e-3,
)
linear_svc_model = classifier.fit(x_trans, y_train)
linear_svc_model

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)

In [8]:
# Reuse the vocabulary and IDF weights learned from the training set.
# Calling fit_transform here would refit the vectorizer on the test texts, so the
# feature columns would no longer be guaranteed to match those the classifier was
# trained on, and the vectorizer stored in the checkpoint would be the test-fitted one.
x_test_trans=tfidf_vect.transform(x_test)

In [9]:
# Predict labels for the vectorized test texts.
y_pred=linear_svc_model.predict(x_test_trans)


In [10]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.89306358381502893

In [11]:
# Bundle what is needed to reuse the classifier later: the vectorizer, the model,
# the scikit-learn version it was trained with, and its test accuracy.
sciket_version = sklearn.__version__
text_clf_param = {
    'preprocessing': tfidf_vect,
    'model': linear_svc_model,
    'sklearn_version': sciket_version,
    'accuracy': accuracy,
}

Saving the model

In [12]:
import joblib

# Persist the checkpoint dict to disk; joblib.dump returns the list of files written.
filename = 'models/text_clf_checkpoint.joblib'
joblib.dump(text_clf_param, filename)

['models/text_clf_checkpoint.joblib']

In [13]:
# Reload the checkpoint dict from `filename`.
# NOTE: joblib files are pickle-based, so only load checkpoints from a trusted source.
clf_checkpoint=joblib.load(filename)

In [14]:

# Pull the vectorizer back out of the reloaded checkpoint and display its parameters.
reload_vect=clf_checkpoint['preprocessing']
reload_vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=15, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

# Creating a pipeline for the model

In [15]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can be passed straight to
# fit/predict. The steps are unfitted clones (same hyper-parameters) of the
# existing estimators: a Pipeline fits the step objects it is given, so passing
# `tfidf_vect` and `classifier` directly would refit them in place and silently
# change the objects that `text_clf_param` and `linear_svc_model` still reference.
clf_pipeline=Pipeline(steps=[('tfidf_vect',clone(tfidf_vect)),('classifier',clone(classifier))])
pipeline_model=clf_pipeline.fit(x_train,y_train)

In [16]:
# The pipeline takes the raw test texts: it vectorizes them and then classifies.
y_pred=pipeline_model.predict(x_test)

In [17]:
# Accuracy of the predictions currently held in y_pred against the test labels.
accuracy_score(y_test,y_pred)

0.89306358381502893

In [18]:
# Checkpoint for the pipeline. The accuracy is computed from the pipeline's own
# predictions rather than reusing `accuracy`, which holds the score of the
# standalone vectorizer + LinearSVC evaluated earlier.
pipe_clf_param={}
pipe_clf_param['pipeline_clf']=pipeline_model
pipe_clf_param['sklearn_version']=sciket_version
pipe_clf_param['accuracy']=accuracy_score(y_test,pipeline_model.predict(x_test))

In [19]:
# Point `filename` at the pipeline checkpoint path and write the dict to disk.
filename = 'models/pipe_clf_checkpoint.joblib'
joblib.dump(pipe_clf_param, filename)

['models/pipe_clf_checkpoint.joblib']

In [20]:
# Round-trip check: load the pipeline checkpoint, show its contents,
# then predict on the raw test texts with the reloaded pipeline.
pipe_clf_checkpoint = joblib.load(filename)
print(pipe_clf_checkpoint)
reload_clf = pipe_clf_checkpoint['pipeline_clf']
y_pred = reload_clf.predict(x_test)

{'pipeline_clf': Pipeline(memory=None,
     steps=[('tfidf_vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=15, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0))]), 'sklearn_version': '0.19.1', 'accuracy': 0.89306358381502893}


In [21]:
# Accuracy of the predictions currently held in y_pred against the test labels.
accuracy_score(y_test,y_pred)

0.89306358381502893