In [1]:
import pandas as pd
data = pd.read_csv('senti9.csv')
df = pd.DataFrame(data)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [2]:
df.describe

<bound method NDFrame.describe of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [3]:
df.shape

(50000, 2)

In [4]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
df.drop_duplicates(inplace=True)
print(df.shape)

(49582, 2)


In [6]:
print(df.isnull().sum())

review       0
sentiment    0
dtype: int64


In [7]:
import numpy as np
import nltk
from nltk.corpus import stopwords
import string

In [8]:
def process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean
# to show how the tokenization will take place
df['review'].head().apply(process)

0    [One, reviewers, mentioned, watching, 1, Oz, e...
1    [wonderful, little, production, br, br, filmin...
2    [thought, wonderful, way, spend, time, hot, su...
3    [Basically, theres, family, little, boy, Jake,...
4    [Petter, Matteis, Love, Time, Money, visually,...
Name: review, dtype: object

In [9]:
#convert collection of words in matrix 
from sklearn.feature_extraction.text import TfidfVectorizer
stopset = set(stopwords.words('english'))
vector = TfidfVectorizer(use_idf=True, lowercase=True , strip_accents='ascii' , stop_words = stopset)

In [10]:
message = vector.fit_transform(df['review'])
print(message)

  (0, 81735)	0.046909423357689384
  (0, 22625)	0.0741715723188564
  (0, 91779)	0.057469196048511154
  (0, 90143)	0.06203235610057424
  (0, 96993)	0.05235283229958593
  (0, 94123)	0.07156848297095121
  (0, 18632)	0.07304795993751523
  (0, 8982)	0.045144024513614954
  (0, 56534)	0.036998231077040734
  (0, 31346)	0.04817988305079993
  (0, 82436)	0.06478502242034845
  (0, 86423)	0.05508209420752167
  (0, 50949)	0.04873534078272289
  (0, 10378)	0.10062362659413682
  (0, 93290)	0.049747787045981165
  (0, 17537)	0.0517773419118707
  (0, 58076)	0.04984747481202113
  (0, 55518)	0.08438001666326782
  (0, 98647)	0.02515185509498216
  (0, 37010)	0.05291248867548508
  (0, 64469)	0.049827467485849826
  (0, 49632)	0.04801383491398172
  (0, 45652)	0.1701783995778295
  (0, 62186)	0.10390728059051213
  (0, 83694)	0.07010042149916931
  :	:
  (49581, 43277)	0.08263934792246524
  (49581, 32082)	0.09652644945909525
  (49581, 85442)	0.0981781270951443
  (49581, 100256)	0.08959751784536739
  (49581, 60248)	0.

In [11]:
#spliting the data into 80% training and 20% testing
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(message, df['sentiment'], test_size=0.20, random_state=4)
# here we can see the shape of the data
print(message.shape)

(49582, 101865)


In [12]:
print(xtrain)

  (0, 67520)	0.1802876392768567
  (0, 10530)	0.36457055918762404
  (0, 46788)	0.14259744842954716
  (0, 6588)	0.1433930916120579
  (0, 25337)	0.12919738158533484
  (0, 29496)	0.13534770680400787
  (0, 96136)	0.24952539190708595
  (0, 31270)	0.3089669200334479
  (0, 85722)	0.1343132659044895
  (0, 38330)	0.10588933340636057
  (0, 9379)	0.13448180076430033
  (0, 20055)	0.11287807332991301
  (0, 73426)	0.09099472584224556
  (0, 72401)	0.13011604104613972
  (0, 62903)	0.11573768239097719
  (0, 16824)	0.09398587785653613
  (0, 85913)	0.09603247032267977
  (0, 69701)	0.12142453028707469
  (0, 44694)	0.09822669271487397
  (0, 30842)	0.06733033113233823
  (0, 84060)	0.11177329209980356
  (0, 70531)	0.11131883513140763
  (0, 78720)	0.13607008167497528
  (0, 45036)	0.09062261663593474
  (0, 23123)	0.07996448061123537
  :	:
  (39664, 7604)	0.05101239175007436
  (39664, 9312)	0.07165270579483903
  (39664, 61687)	0.07846928970584305
  (39664, 93384)	0.07202144721211386
  (39664, 35647)	0.0720165527

In [13]:
print(ytrain)

34694    negative
36086    positive
47515    positive
14690    positive
25605    negative
           ...   
23442    negative
11885    negative
27181    positive
8377     positive
17583    negative
Name: sentiment, Length: 39665, dtype: object


In [14]:
print(xtest)

  (0, 93450)	0.2245688263676923
  (0, 46795)	0.2245688263676923
  (0, 41511)	0.2245688263676923
  (0, 34692)	0.2245688263676923
  (0, 33600)	0.19656816745179317
  (0, 69833)	0.16734299994306095
  (0, 11978)	0.190135973427472
  (0, 70507)	0.13390642485230467
  (0, 21841)	0.10337602635381739
  (0, 27708)	0.1679459762388802
  (0, 1095)	0.14362788566343015
  (0, 87875)	0.1302806728809082
  (0, 55879)	0.13519486311092876
  (0, 68389)	0.15394564678093062
  (0, 35490)	0.13448191124135608
  (0, 56157)	0.12633119859680964
  (0, 89727)	0.11821966715656902
  (0, 18842)	0.14088493817630593
  (0, 62557)	0.10305324545089924
  (0, 47983)	0.1050993987420941
  (0, 83546)	0.09661209633626074
  (0, 57627)	0.12174626402020008
  (0, 77198)	0.12125509803382878
  (0, 69447)	0.10853812260127291
  (0, 85565)	0.11962438642224456
  :	:
  (9916, 38090)	0.011560200904417341
  (9916, 100193)	0.09366912493153756
  (9916, 57)	0.0176394302812594
  (9916, 52755)	0.020798562772008145
  (9916, 33219)	0.018664311347157042

In [15]:
print(ytest)

34403    positive
3542     positive
43726    negative
2060     positive
40294    positive
           ...   
11721    negative
28392    positive
32777    positive
23939    negative
40642    negative
Name: sentiment, Length: 9917, dtype: object


In [16]:
#creating and training the Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier1 = MultinomialNB().fit(xtrain, ytrain)

In [17]:
abc = classifier1.predict(xtrain)
abc[1:100]

array(['positive', 'positive', 'positive', 'negative', 'negative',
       'positive', 'positive', 'positive', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'negative', 'negative', 'positive', 'positive', 'negative',
       'negative', 'positive', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'positive', 'negative', 'positive', 'negative', 'positive',
       'negative', 'positive', 'positive', 'positive', 'negative',
       'positive', 'positive', 'positive', 'negative', 'negative',
       'positive', 'positive', 'positive', 'positive', 'negative',
       'positive', 'negative', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'negative', 'negative', 'positive', 'positive', 'negative',
       'positive', 'positive', 'negative', 'positive', 'positi

In [18]:
xyz = ytrain.values
xyz[1:100]

array(['positive', 'positive', 'positive', 'negative', 'negative',
       'positive', 'positive', 'positive', 'negative', 'negative',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'negative', 'negative', 'positive', 'positive', 'negative',
       'negative', 'positive', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'positive', 'negative', 'positive', 'positive', 'negative',
       'negative', 'positive', 'positive', 'positive', 'negative',
       'positive', 'positive', 'positive', 'negative', 'negative',
       'positive', 'positive', 'positive', 'positive', 'negative',
       'positive', 'negative', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'negative', 'negative', 'positive', 'positive', 'negative',
       'negative', 'positive', 'negative', 'positive', 'positi

In [19]:
#Model evaluation on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier1.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))

              precision    recall  f1-score   support

    negative       0.90      0.92      0.91     19768
    positive       0.92      0.90      0.91     19897

    accuracy                           0.91     39665
   macro avg       0.91      0.91      0.91     39665
weighted avg       0.91      0.91      0.91     39665


Confusion Matrix: 
 [[18167  1601]
 [ 2023 17874]]
Accuracy: 
 0.9086348165889323


In [20]:
# Evaluating the model on the testing data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier1.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))

              precision    recall  f1-score   support

    negative       0.85      0.88      0.87      4930
    positive       0.87      0.85      0.86      4987

    accuracy                           0.86      9917
   macro avg       0.86      0.86      0.86      9917
weighted avg       0.86      0.86      0.86      9917


Confusion Matrix: 
 [[4322  608]
 [ 734 4253]]
Accuracy: 
 0.8646768175859635


In [21]:
#printing the prediction result
print(classifier1.predict(xtest))
#printing  the actual values
print(ytest.values)

['positive' 'positive' 'negative' ... 'positive' 'negative' 'negative']
['positive' 'positive' 'negative' ... 'positive' 'negative' 'negative']


In [22]:
data = pd.read_csv('news3.csv')
df1 = pd.DataFrame(data)
df1


Unnamed: 0,text,time
0,Trump teases possible 2024 run at his 1st big ...,2 days ago
1,Charges expected Thursday for Trump''s company...,14 mins ago
2,Donald Trump heads to US-Mexico border for fre...,5 hours ago
3,With 15 millionaires Biden Cabinet’s net worth...,3 hours ago
4,Trump had to cancel a July 4 weekend rally at ...,4 hours ago
...,...,...
774,Trump Organization and CFO Weisselberg Will Be...,15 mins ago
775,Trump blasts former ally McConnell after book ...,1 day ago
776,Trump's followers do take him literally,2 hours ago
777,Xbox Remote Play Gameplay Usage Review on Seri...,4 hours ago


In [23]:
message1 = df1['text']

In [31]:
message2 = np.array(["taylor is a good slut"])

In [24]:
message1_vector = vector.transform(message1)
print(classifier1.predict(message1_vector))

['negative' 'positive' 'positive' 'negative' 'negative' 'negative'
 'positive' 'negative' 'positive' 'positive' 'positive' 'positive'
 'positive' 'negative' 'negative' 'negative' 'positive' 'positive'
 'positive' 'positive' 'negative' 'negative' 'negative' 'positive'
 'negative' 'positive' 'negative' 'positive' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'positive' 'positive' 'negative' 'positive' 'negative' 'negative'
 'negative' 'positive' 'negative' 'negative' 'negative' 'positive'
 'negative' 'negative' 'negative' 'positive' 'negative' 'negative'
 'positive' 'negative' 'negative' 'negative' 'positive' 'negative'
 'positive' 'positive' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'positive' 'negative' 'positive' 'negative'
 'positive' 'negative' 'negative' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'negative' 'negat

In [32]:
message2_vector = vector.transform(message2)
print(classifier1.predict(message2_vector))

['negative']


In [35]:
import pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(classifier1, file)


In [20]:
from sklearn import svm
from sklearn import metrics
#create a classifier
rbf1 = svm.SVC(kernel="rbf", random_state = 4)
#train the model
rbf1.fit(xtrain,ytrain)
#predict the response
pred_rbf = rbf1.predict(xtest)

#accuracy
print("acuracy:", metrics.accuracy_score(ytest,pred_rbf))
#precision score
print("precision:", metrics.precision_score(ytest,pred_rbf, average='micro'))
#recall score
print("recall" , metrics.recall_score(ytest,pred_rbf, average = 'micro'))
print(metrics.classification_report(ytest,pred_rbf))

acuracy: 0.8772814359181204
precision: 0.8772814359181204
recall 0.8772814359181204
              precision    recall  f1-score   support

    negative       0.90      0.85      0.87      4930
    positive       0.86      0.91      0.88      4987

    accuracy                           0.88      9917
   macro avg       0.88      0.88      0.88      9917
weighted avg       0.88      0.88      0.88      9917



In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score
print("Confusion Matrix: \n", confusion_matrix(ytest, pred_rbf))
print("Accuracy: \n", accuracy_score(ytest, pred_rbf))

Confusion Matrix: 
 [[4183  747]
 [ 470 4517]]
Accuracy: 
 0.8772814359181204


In [24]:
from sklearn import svm
from sklearn import metrics
#creating a classifier
cls = svm.SVC(kernel="linear",C=1,random_state = 4)
#training the model
cls.fit(xtrain,ytrain)
#predicting the response
pred = cls.predict(xtest)


# printing accuracy
print("acuracy:", metrics.accuracy_score(ytest,pred))
#precision score
print("precision:", metrics.precision_score(ytest,pred, average = 'micro'))
#recall score
print("recall" , metrics.recall_score(ytest,pred, average = 'micro'))
print(metrics.classification_report(ytest, pred))

acuracy: 0.8709287082787133
precision: 0.8709287082787133
recall 0.8709287082787133
              precision    recall  f1-score   support

    negative       0.87      0.87      0.87      4930
    positive       0.87      0.87      0.87      4987

    accuracy                           0.87      9917
   macro avg       0.87      0.87      0.87      9917
weighted avg       0.87      0.87      0.87      9917



In [25]:
from sklearn.ensemble import RandomForestClassifier
classifier2 = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 4)
classifier2.fit(xtrain, ytrain)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=4)

In [26]:
#predictions
predictions = classifier1.predict(xtest)
predictions

array(['positive', 'positive', 'negative', ..., 'positive', 'negative',
       'negative'], dtype='<U8')

In [27]:
ytest

34403    positive
3542     positive
43726    negative
2060     positive
40294    positive
           ...   
11721    negative
28392    positive
32777    positive
23939    negative
40642    negative
Name: sentiment, Length: 9917, dtype: object

In [28]:
from sklearn.metrics import classification_report
print(classification_report(ytest,predictions))

              precision    recall  f1-score   support

    negative       0.84      0.88      0.86      4930
    positive       0.88      0.84      0.86      4987

    accuracy                           0.86      9917
   macro avg       0.86      0.86      0.86      9917
weighted avg       0.86      0.86      0.86      9917



In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(ytest, predictions))
accuracy_score(ytest, predictions)

[[4351  579]
 [ 812 4175]]


0.859735807199758