# Importing

In [61]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('data.csv', encoding = 'ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [4]:
df.shape

(4815, 9)

In [5]:
df.columns

Index(['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User',
       'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'],
      dtype='object')

In [6]:
df.columns = ['id', 'text', 'sentiment', 'time', 'age', 'country', 'population', 'area', 'density']

In [7]:
df.head()

Unnamed: 0,id,text,sentiment,time,age,country,population,area,density
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [8]:
df.drop(['id', 'population', 'country', 'population', 'area', 'density'], axis = 1, inplace = True)

In [9]:
df.head()

Unnamed: 0,text,sentiment,time,age
0,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20
1,Shanghai is also really exciting (precisely -...,positive,noon,21-30
2,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45
3,happy bday!,positive,morning,46-60
4,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70


# Processing

In [10]:
df.head()

Unnamed: 0,text,sentiment,time,age
0,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20
1,Shanghai is also really exciting (precisely -...,positive,noon,21-30
2,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45
3,happy bday!,positive,morning,46-60
4,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70


In [11]:
df.isnull().sum()

text         1281
sentiment    1281
time         1281
age          1281
dtype: int64

In [12]:
df.dropna(subset = ['text'], inplace = True)

In [13]:
df.isnull().sum()

text         0
sentiment    0
time         0
age          0
dtype: int64

In [14]:
df.shape

(3534, 4)

In [15]:
df.head()

Unnamed: 0,text,sentiment,time,age
0,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20
1,Shanghai is also really exciting (precisely -...,positive,noon,21-30
2,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45
3,happy bday!,positive,morning,46-60
4,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70


In [16]:
for col in ['time', 'age', 'sentiment']:
    print(df[col].value_counts(), end = '\n\n')

morning    1178
noon       1178
night      1178
Name: time, dtype: int64

0-20      590
21-30     590
31-45     590
46-60     588
60-70     588
70-100    588
Name: age, dtype: int64

neutral     1430
positive    1103
negative    1001
Name: sentiment, dtype: int64



In [17]:
text = df.text

In [18]:
text.replace('[^a-zA-Z]', ' ', regex = True, inplace = True)

In [19]:
text.head()

0    Last session of the day  http   twitpic com   ezh
1     Shanghai is also really exciting  precisely  ...
2    Recession hit Veronique Branquinho  she has to...
3                                          happy bday 
4               http   twitpic com  w  p   I like it  
Name: text, dtype: object

In [20]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

rows = list()
for row in range(0, len(text.index)):
    words = nltk.word_tokenize(text[row])
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    rows.append(' '.join(words))

In [21]:
rows[0:5]

['last session day http twitpic com ezh',
 'shanghai also realli excit precis skyscrap galor good tweep china sh bj',
 'recess hit veroniqu branquinho quit compani shame',
 'happi bday',
 'http twitpic com w p i like']

In [22]:
vectorizer = CountVectorizer()
vectorizer.fit(rows)

CountVectorizer()

In [23]:
rows = vectorizer.transform(rows)

In [24]:
rows.shape

(3534, 5788)

In [25]:
rows.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
vectorizer.get_feature_names_out()

array(['aa', 'aaaaa', 'aaaaaaaa', ..., 'zr', 'zs', 'zt'], dtype=object)

In [27]:
df_text = pd.DataFrame(rows.toarray(), columns = [str(i) for i in range(rows.shape[1])])

In [28]:
df_text.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5778,5779,5780,5781,5782,5783,5784,5785,5786,5787
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
df_text.shape

(3534, 5788)

In [30]:
df.head()

Unnamed: 0,text,sentiment,time,age
0,Last session of the day http twitpic com ezh,neutral,morning,0-20
1,Shanghai is also really exciting precisely ...,positive,noon,21-30
2,Recession hit Veronique Branquinho she has to...,negative,night,31-45
3,happy bday,positive,morning,46-60
4,http twitpic com w p I like it,positive,noon,60-70


In [37]:
df_final = pd.concat([df_text, pd.get_dummies(df.iloc[:, 2:]), df.iloc[:, 1]], axis = 1, join = 'inner')

In [38]:
df_final.shape

(3534, 5798)

In [39]:
df_final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,time_morning,time_night,time_noon,age_0-20,age_21-30,age_31-45,age_46-60,age_60-70,age_70-100,sentiment
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,neutral
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,positive
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,negative
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,positive
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,positive


# Testing

In [83]:
def check(df, test_size, random_state, model):
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop('sentiment', axis = 1),
        df['sentiment'],
        test_size = test_size,
        random_state = random_state
    )
    model.fit(x_train, y_train)
    print(f'''Results for {model.__str__()} at {test_size}/{random_state}:
    Test: {accuracy_score(y_test, model.predict(x_test))}
    Train: {accuracy_score(y_train, model.predict(x_train))}\n''')

In [85]:
check(df_final, 0.3, 42, MultinomialNB())
check(df_final, 0.3, 50, MultinomialNB())
check(df_final, 0.25, 50, MultinomialNB())
check(df_final, 0.28, 42, MultinomialNB())

check(df_final, 0.3, 42, LogisticRegression(max_iter = 500))
check(df_final, 0.3, 50, LogisticRegression(max_iter = 500))
check(df_final, 0.25, 50, LogisticRegression(max_iter = 500))
check(df_final, 0.28, 42, LogisticRegression(max_iter = 500))

Results for MultinomialNB() at 0.3/42:
    Test: 0.6154571159283695
    Train: 0.9017387788111605

Results for MultinomialNB() at 0.3/50:
    Test: 0.5994344957587182
    Train: 0.9118479579458147

Results for MultinomialNB() at 0.25/50:
    Test: 0.5984162895927602
    Train: 0.9037735849056604

Results for MultinomialNB() at 0.28/42:
    Test: 0.6151515151515151
    Train: 0.9044811320754716

Results for LogisticRegression(max_iter=500) at 0.3/42:
    Test: 0.6324222431668237
    Train: 0.9688637282652649

Results for LogisticRegression(max_iter=500) at 0.3/50:
    Test: 0.6390197926484449
    Train: 0.9700768297614234

Results for LogisticRegression(max_iter=500) at 0.25/50:
    Test: 0.6481900452488688
    Train: 0.9671698113207547

Results for LogisticRegression(max_iter=500) at 0.28/42:
    Test: 0.6323232323232323
    Train: 0.967374213836478



# Conclusion 🤷‍♂

In [87]:
# LogReg gave relatively better results