In [1]:
import pandas as pd
df = pd.read_csv("Covid_tweets_US.csv")

review_df = df[['text','Polarity']]

review_df = review_df[review_df['Polarity'] != 'Neutral']

review_df["Polarity"].value_counts()

Positive    154698
Negative     64326
Name: Polarity, dtype: int64

In [2]:
sentiment_label = review_df.Polarity.factorize()
sentiment_label

(array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 Index(['Positive', 'Negative'], dtype='object'))

In [3]:
review_df['Label'] = review_df.Polarity.map({'Positive':0, 'Negative':1})

In [4]:
review_df.head()

Unnamed: 0,text,Polarity,Label
0,mayor janefortampa janecastor done amazing job...,Positive,0
1,friend genezahas died fr coronavirus alamedaco...,Positive,0
3,keep questioning normal look like get nightmar...,Positive,0
4,ask better view making call amykennedy tonight...,Positive,0
5,lovely seder homemade matzah secret making mat...,Positive,0


In [6]:
X = review_df.text
y = review_df.Label
print(X.shape)
print(y.shape)

(219024,)
(219024,)


In [7]:
X

0         mayor janefortampa janecastor done amazing job...
1         friend genezahas died fr coronavirus alamedaco...
3         keep questioning normal look like get nightmar...
4         ask better view making call amykennedy tonight...
5         lovely seder homemade matzah secret making mat...
                                ...                        
357491    weirdest quarentine meal straight meatball sau...
357493    really funny people tweeple covid stayhome bit...
357494    famous santa monica stair shut covid lot folk ...
357495    surreal nature daily covid routine first ita t...
357497            road happiness covid wewillgetthroughthis
Name: text, Length: 219024, dtype: object

In [8]:
y

0         0
1         0
3         0
4         0
5         0
         ..
357491    0
357493    0
357494    0
357495    0
357497    0
Name: Label, Length: 219024, dtype: int64

In [9]:
from sklearn.model_selection  import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.20, random_state =42)

In [10]:
X_train.shape

(175219,)

In [11]:
y_train.shape

(175219,)

In [12]:
print(X_test.shape)
print(y_test.shape)

(43805,)
(43805,)


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [14]:
vect.fit(X_train)
vect.vocabulary_

{'buying': 13969,
 'local': 61966,
 'vegetable': 114582,
 'help': 46232,
 'stay': 100894,
 'business': 13824,
 'farmer': 35837,
 'market': 64843,
 'closed': 19173,
 'due': 30672,
 'covid': 22255,
 'pandemic': 79159,
 'yasukochifamilyfarm': 120719,
 'oceanside': 76466,
 'taking': 104369,
 'produce': 84754,
 'directly': 27608,
 'home': 47471,
 'delivering': 26105,
 'csa': 23420,
 'box': 12121,
 'full': 39835,
 'fresh': 39307,
 'agriculture': 1813,
 'go': 42157,
 'forever': 38448,
 'least': 59920,
 'never': 73102,
 'called': 14390,
 'american': 3326,
 'family': 35672,
 'field': 36717,
 'millerpark': 68155,
 'baseball': 7998,
 'mlb': 68910,
 'brewer': 12677,
 'positivity': 83470,
 'thatgirlondeck': 106175,
 'schrades': 94199,
 'please': 82497,
 'step': 101413,
 'away': 6902,
 'thanks': 106038,
 'evildead': 34579,
 'ashvsevildead': 5743,
 'horror': 47926,
 'horrormovies': 47931,
 'groovybruce': 43860,
 'coronavirus': 21513,
 'south': 99418,
 'yulee': 121373,
 'fl': 37418,
 'weigh': 117393,


In [15]:
vect = CountVectorizer(stop_words='english')

In [16]:
vectorizer = vect.fit(X)

In [17]:
vect.vocabulary_

{'mayor': 67361,
 'janefortampa': 54730,
 'janecastor': 54724,
 'amazing': 3262,
 'job': 55991,
 'keeping': 58040,
 'resident': 91964,
 'cityoftampa': 19023,
 'healthy': 46922,
 'safe': 94897,
 'applauded': 4853,
 'work': 122559,
 'winning': 121949,
 'war': 119224,
 'covid': 22791,
 'leadership': 61205,
 'action': 854,
 'friend': 40305,
 'genezahas': 41901,
 'died': 27939,
 'fr': 39746,
 'coronavirus': 22028,
 'alamedacounty': 2281,
 'born': 12169,
 'westoakland': 120727,
 'greek': 44536,
 'munity': 72591,
 'true': 113332,
 'oakland': 78071,
 'citizen': 18819,
 'quietly': 88838,
 'glued': 43067,
 'city': 18848,
 'worked': 122576,
 'chair': 17017,
 'treasurer': 112895,
 'ballotmeasures': 7760,
 'oaklandschools': 78077,
 'rip': 93019,
 'questioning': 88779,
 'normal': 76660,
 'look': 63822,
 'like': 62536,
 'nightmare': 75720,
 'people': 82643,
 'avoid': 6986,
 'crowded': 23816,
 'place': 84235,
 'concert': 20612,
 'sport': 102497,
 'event': 35230,
 'theater': 108906,
 'mall': 65744,
 't

In [18]:
print(len(vect.get_feature_names()))

124841


In [19]:
X_train_transformed = vect.transform(X_train)
X_test_transformed =vect.transform(X_test)

In [20]:
print(type(X_train_transformed))
print(X_train_transformed)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 1854)	1
  (0, 12409)	1
  (0, 14161)	1
  (0, 14308)	1
  (0, 19637)	1
  (0, 22791)	1
  (0, 23979)	1
  (0, 26729)	1
  (0, 28265)	1
  (0, 36659)	1
  (0, 40202)	1
  (0, 47311)	1
  (0, 48578)	1
  (0, 63449)	1
  (0, 66407)	1
  (0, 78315)	1
  (0, 81063)	1
  (0, 86798)	2
  (0, 103352)	1
  (0, 106920)	1
  (0, 117392)	1
  (0, 123695)	1
  (1, 3399)	1
  (1, 8170)	1
  (1, 12981)	1
  :	:
  (175217, 42313)	1
  (175217, 47311)	1
  (175217, 48812)	1
  (175217, 50069)	1
  (175217, 61300)	1
  (175217, 64773)	1
  (175217, 72589)	1
  (175217, 79270)	1
  (175217, 83312)	1
  (175217, 86217)	1
  (175217, 99080)	1
  (175217, 122116)	1
  (175217, 122559)	1
  (175218, 3813)	1
  (175218, 4103)	1
  (175218, 22791)	1
  (175218, 33524)	1
  (175218, 70071)	1
  (175218, 74488)	1
  (175218, 77008)	1
  (175218, 81063)	1
  (175218, 87670)	1
  (175218, 90856)	1
  (175218, 99898)	1
  (175218, 111558)	1


In [21]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

# fit
naive_model = mnb.fit(X_train_transformed,y_train)

# predict class
y_pred_class = mnb.predict(X_test_transformed)

In [22]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.8767492295400069

In [23]:
y_pred_class

array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

In [24]:
x = metrics.confusion_matrix(y_test, y_pred_class)
x

array([[28235,  2786],
       [ 2613, 10171]], dtype=int64)

In [25]:
y_pred_class

array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

In [26]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)
#[row, column]
TN = confusion[1, 1]
FP = confusion[1, 0]
FN = confusion[0, 1]
TP = confusion[0, 0]

[[28235  2786]
 [ 2613 10171]]


In [27]:
sensitivity = TP / float(FN + TP)
print("sensitivity",sensitivity)

sensitivity 0.910189871377454


In [28]:
specificity = TN / float(TN + FP)

print("specificity",specificity)

specificity 0.7956038798498123


In [29]:
precision = TP / float(TP + FP)

print("precision",precision)
print(metrics.precision_score(y_test, y_pred_class))

precision 0.9152943464730291
0.7849810913019989


In [30]:
print("precision",precision)
print("PRECISION SCORE :",metrics.precision_score(y_test, y_pred_class))
print("RECALL SCORE :", metrics.recall_score(y_test, y_pred_class))
print("F1 SCORE :",metrics.f1_score(y_test, y_pred_class))

precision 0.9152943464730291
PRECISION SCORE : 0.7849810913019989
RECALL SCORE : 0.7956038798498123
F1 SCORE : 0.7902567887805447


In [31]:
import joblib
fin_model = naive_model
joblib.dump(fin_model, 'naive_model.joblib')
joblib.dump(vectorizer, 'CountVectorizer.joblib')

['CountVectorizer.joblib']