# Importing & Loading

In [45]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('data.csv', encoding = 'ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [4]:
df.shape

(4815, 9)

In [5]:
df.columns

Index(['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User',
       'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'],
      dtype='object')

In [6]:
df.columns = ['id', 'text', 'sentiment', 'time', 'age', 'country', 'population', 'area', 'density']

In [7]:
df.head()

Unnamed: 0,id,text,sentiment,time,age,country,population,area,density
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [8]:
df.drop(['id', 'population', 'country', 'population', 'area', 'density'], axis = 1, inplace = True)

In [9]:
df.head()

Unnamed: 0,text,sentiment,time,age
0,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20
1,Shanghai is also really exciting (precisely -...,positive,noon,21-30
2,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45
3,happy bday!,positive,morning,46-60
4,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70


# Pre-Processing

In [10]:
df.head()

Unnamed: 0,text,sentiment,time,age
0,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20
1,Shanghai is also really exciting (precisely -...,positive,noon,21-30
2,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45
3,happy bday!,positive,morning,46-60
4,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70


In [11]:
df.isnull().sum()

text         1281
sentiment    1281
time         1281
age          1281
dtype: int64

In [12]:
df.dropna(subset = ['text'], inplace = True)

In [13]:
df.isnull().sum()

text         0
sentiment    0
time         0
age          0
dtype: int64

In [14]:
df.shape

(3534, 4)

In [15]:
df.head()

Unnamed: 0,text,sentiment,time,age
0,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20
1,Shanghai is also really exciting (precisely -...,positive,noon,21-30
2,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45
3,happy bday!,positive,morning,46-60
4,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70


In [16]:
for col in ['time', 'age', 'sentiment']:
    print(df[col].value_counts(), end = '\n\n')

morning    1178
noon       1178
night      1178
Name: time, dtype: int64

0-20      590
21-30     590
31-45     590
46-60     588
60-70     588
70-100    588
Name: age, dtype: int64

neutral     1430
positive    1103
negative    1001
Name: sentiment, dtype: int64



In [17]:
text = df.text

In [21]:
text.replace('[^a-zA-Z]', ' ', regex = True, inplace = True)

In [22]:
text.head()

0    Last session of the day  http   twitpic com   ezh
1     Shanghai is also really exciting  precisely  ...
2    Recession hit Veronique Branquinho  she has to...
3                                          happy bday 
4               http   twitpic com  w  p   I like it  
Name: text, dtype: object

In [26]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

rows = list()
for row in range(0, len(text.index)):
    words = nltk.word_tokenize(text[row])
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    rows.append(' '.join(words))

In [29]:
rows[0:5]

['last session day http twitpic com ezh',
 'shanghai also realli excit precis skyscrap galor good tweep china sh bj',
 'recess hit veroniqu branquinho quit compani shame',
 'happi bday',
 'http twitpic com w p i like']

In [31]:
vectorizer = CountVectorizer()
vectorizer.fit(rows)

CountVectorizer()

In [32]:
rows = vectorizer.transform(rows)

In [33]:
rows

<3534x5788 sparse matrix of type '<class 'numpy.int64'>'
	with 25628 stored elements in Compressed Sparse Row format>

In [36]:
rows.shape

(3534, 5788)

# Train Test split

In [59]:
x_train, x_test, y_train, y_test = train_test_split(rows, df.sentiment, test_size = 0.30, random_state = 42)

In [60]:
x_train.shape, x_test.shape

((2473, 5788), (1061, 5788))

In [54]:
model = MultinomialNB()

In [61]:
model.fit(x_train, y_train)

MultinomialNB()

In [62]:
print(classification_report(y_test, model.predict(x_test)))

              precision    recall  f1-score   support

    negative       0.65      0.54      0.59       306
     neutral       0.57      0.61      0.59       423
    positive       0.66      0.71      0.69       332

    accuracy                           0.62      1061
   macro avg       0.63      0.62      0.62      1061
weighted avg       0.62      0.62      0.62      1061



In [63]:
accuracy_score(y_test, model.predict(x_test))

0.6201696512723845

In [64]:
accuracy_score(y_train, model.predict(x_train))

0.9069955519611808

# Conclusion
overfitting model