## <span style='color:green'> Task 3 : Data Exploration & Preprocessing, Topic Modeling & Sentiment Analysis</span>

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

In [26]:
# read the csv file generated from clean_tweets_dataframe.py
tweets_df = pd.read_csv("data/clean_processed_tweet_data.csv")

# 3.1 Data Exploration

In [27]:
# display the first 5 rows from our dataset
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,source,original_text,subjectivity,polarity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,0,2022-08-07 22:31:20+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,0.190625,-0.125,en,4,2,i_ameztoy,20497,2621,,City,i_ameztoy,
1,1,2022-08-07 22:31:16+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,0.1,-0.1,en,691,201,ZIisq,65,272,,"China, Taiwan",IndoPac_Info,
2,2,2022-08-07 22:31:07+00:00,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...",0.0,0.0,en,0,0,Fin21Free,85,392,,XiJinping,ZelenskyyUa,Netherlands
3,3,2022-08-07 22:31:06+00:00,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",0.35,0.1,en,0,0,Fin21Free,85,392,,XiJinping,,Netherlands
4,4,2022-08-07 22:31:04+00:00,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",0.55625,-6.938894e-18,en,1521,381,VizziniDolores,910,2608,,,ChinaUncensored,"Ayent, Schweiz"


In [28]:
# display dataframe information
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          22000 non-null  int64  
 1   created_at          22000 non-null  object 
 2   source              22000 non-null  object 
 3   original_text       22000 non-null  object 
 4   subjectivity        22000 non-null  float64
 5   polarity            22000 non-null  float64
 6   lang                22000 non-null  object 
 7   favorite_count      22000 non-null  int64  
 8   retweet_count       22000 non-null  int64  
 9   original_author     22000 non-null  object 
 10  followers_count     22000 non-null  int64  
 11  friends_count       22000 non-null  int64  
 12  possibly_sensitive  6191 non-null   object 
 13  hashtags            19141 non-null  object 
 14  user_mentions       18698 non-null  object 
 15  place               12107 non-null  object 
dtypes: f

In [29]:
# check null values
tweets_df.isna()

Unnamed: 0.1,Unnamed: 0,created_at,source,original_text,subjectivity,polarity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True
1,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True
2,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False
4,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
21996,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
21997,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
21998,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [30]:
# shape of the dataframe
tweets_df.shape

(22000, 16)

In [31]:
# show columns of the dataframe
tweets_df.columns

Index(['Unnamed: 0', 'created_at', 'source', 'original_text', 'subjectivity',
       'polarity', 'lang', 'favorite_count', 'retweet_count',
       'original_author', 'followers_count', 'friends_count',
       'possibly_sensitive', 'hashtags', 'user_mentions', 'place'],
      dtype='object')

# 3.2 Data Preprocessing

In [32]:
# add new column named clean_text to store cleaned original text
tweets_df.insert(4,column = 'clean_text',value = tweets_df['original_text'])

In [33]:
# add new column named sentimnt to where the text is positive,negative or neutral
tweets_df.insert(7,column = 'sentiment',value = tweets_df['polarity'])

In [34]:
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,source,original_text,clean_text,subjectivity,polarity,sentiment,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,0,2022-08-07 22:31:20+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,RT @i_ameztoy: Extra random image (I):\n\nLets...,0.190625,-0.125,-0.125,en,4,2,i_ameztoy,20497,2621,,City,i_ameztoy,
1,1,2022-08-07 22:31:16+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,RT @IndoPac_Info: #China's media explains the ...,0.1,-0.1,-0.1,en,691,201,ZIisq,65,272,,"China, Taiwan",IndoPac_Info,
2,2,2022-08-07 22:31:07+00:00,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...","China even cut off communication, they don't a...",0.0,0.0,0.0,en,0,0,Fin21Free,85,392,,XiJinping,ZelenskyyUa,Netherlands
3,3,2022-08-07 22:31:06+00:00,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...","Putin to #XiJinping : I told you my friend, Ta...",0.35,0.1,0.1,en,0,0,Fin21Free,85,392,,XiJinping,,Netherlands
4,4,2022-08-07 22:31:04+00:00,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",0.55625,-6.938894e-18,-6.938894e-18,en,1521,381,VizziniDolores,910,2608,,,ChinaUncensored,"Ayent, Schweiz"


# 3.3 Topic Modelling

# 3.4 Sentiment Analysis