In [2]:
import numpy as np
import pandas as pd

In [3]:
df=pd.read_csv('/home/anu/Downloads/mutilabel_sentimental_dataset/sentimental_train.tsv',sep='\t')
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [4]:
df.shape

(156060, 4)

In [5]:
df.isna().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

In [8]:
print(df['Sentiment'].value_counts())

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64


In [9]:
print(df['Sentiment'].value_counts()/df['Sentiment'].count())
#TF for each rows

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64


In [10]:
df['Phrase'][0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [22]:
import nltk
import re

def clear_text(text):
    text=re.sub("/","",text)
    text=re.sub("[^a-zA-Z]"," ",text)
    text=' '.join(text.split())
    text=text.lower()
    
    return text
    

In [23]:
df['clean_txt']=df['Phrase'].apply(lambda x:clear_text(x))

In [28]:
df['clean_txt'][0]

'a series of escapades demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amounts to much of a story'

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV

def main():
    pipe=Pipeline([('vect',TfidfVectorizer(stop_words='english')),\
                  ('est',LogisticRegression(random_state=0,multi_class='ovr',max_iter=10000))])
    parameters={
        'vect__max_df':(0.25,0.5),
        'vect__ngram_range':((1,2),(1,3)),
        'vect__use_idf':(True,False),
        'vect__norm':('l1','l2'),
        'est__penalty':('l1','l2'),
        'est__C':(0.1,1,10),
    }
    
    #data
    
    x,y=df['clean_txt'],df['Sentiment']#splitting x,y
    xtrain,xtest,ytrain,ytest=train_test_split(x,y)
    print(xtrain.shape)
    
    grid_search=GridSearchCV(pipe,parameters,n_jobs=-1,verbose=1,scoring='accuracy',cv=5)
    grid_search.fit(xtrain,ytrain)
    
    print('Best score:',grid_search.best_score_)
    print('\nBest Parameter Set:')
    best_params=grid_search.best_estimator_.get_params()
    
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r'%(param_name,best_params[param_name]))
    
    from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
    predictions=grid_search.predict(xtest)
    print('Accuracy',accuracy_score(ytest,predictions))
    print('Confusion matrix:',confusion_matrix(ytest,predictions))
    print('classification report:',classification_report(ytest,predictions))
    
    
if __name__=='__main__':
    main()

(117045,)
Fitting 5 folds for each of 96 candidates, totalling 480 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan 0.51047033 0.51191422
 0.51917639 0.5293605  0.51026528 0.51175189 0.5160323  0.52476398
 0.51047033 0.51191422 0.51917639 0.5293605  0.51026528 0.51175189
 0.5160323  0.52476398        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.53218847 0.53605024 0.60988509 0.61363578 0.52759195 0.5320176
 0.60558759 0.61108975 0.53218847 0.53605024 0.60988509 0.61363578
 0.52759195 0.5320176  0.60558759 0.61108975        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan 0.58075954 0.57699176 0.63813063 0.64224871
 0.5785638  0.57476184 0.63682344 0.64114657 0.58075954 0.57699176
 0.63813063 0.64224871 0.5785638  0.57476184 0.63682344 0.64114

Best score: 0.6422487077619718

Best Parameter Set:
	est__C: 10
	est__penalty: 'l2'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
	vect__norm: 'l2'
	vect__use_idf: False
Accuracy 0.6529539920543381
Confusion matrix: [[  615   876   235    27     3]
 [  427  3200  2815   254    10]
 [  111  1600 16459  1776    71]
 [   10   169  3111  4340   598]
 [    4    15   233  1195   861]]
classification report:               precision    recall  f1-score   support

           0       0.53      0.35      0.42      1756
           1       0.55      0.48      0.51      6706
           2       0.72      0.82      0.77     20017
           3       0.57      0.53      0.55      8228
           4       0.56      0.37      0.45      2308

    accuracy                           0.65     39015
   macro avg       0.58      0.51      0.54     39015
weighted avg       0.64      0.65      0.64     39015



In [None]:
#we got 5 reports because we have 5 sentiment values in column

In [35]:
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_txt
0,1,1,A series of escapades demonstrating the adage ...,1,a series of escapades demonstrating the adage ...
1,2,1,A series of escapades demonstrating the adage ...,2,a series of escapades demonstrating the adage ...
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series
...,...,...,...,...,...
156055,156056,8544,Hearst 's,2,hearst s
156056,156057,8544,forced avuncular chortles,1,forced avuncular chortles
156057,156058,8544,avuncular chortles,3,avuncular chortles
156058,156059,8544,avuncular,2,avuncular
