# Upload Data Set

In [1]:
!pip install kaggle



In [2]:
! mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the datatattle/covid-19-nlp-text-classification dataset archive
# into the current working directory with the Kaggle CLI.
!kaggle datasets download -d datatattle/covid-19-nlp-text-classification

Downloading covid-19-nlp-text-classification.zip to /content
  0% 0.00/4.38M [00:00<?, ?B/s]
100% 4.38M/4.38M [00:00<00:00, 64.8MB/s]


In [6]:
!unzip \*.zip && rm *.zip

Archive:  covid-19-nlp-text-classification.zip
  inflating: Corona_NLP_test.csv     
  inflating: Corona_NLP_train.csv    


# Adding Required Libraries

In [7]:
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Understanding Data

In [8]:
# Both CSV files are decoded as ISO-8859-1 (Latin-1).
CSV_ENCODING = 'ISO-8859-1'
train = pd.read_csv("Corona_NLP_train.csv", encoding=CSV_ENCODING)
test = pd.read_csv("Corona_NLP_test.csv", encoding=CSV_ENCODING)

In [9]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [10]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [11]:
# Print the column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       3798 non-null   int64 
 1   ScreenName     3798 non-null   int64 
 2   Location       2964 non-null   object
 3   TweetAt        3798 non-null   object
 4   OriginalTweet  3798 non-null   object
 5   Sentiment      3798 non-null   object
dtypes: int64(2), object(4)
memory usage: 178.2+ KB


In [12]:
# Number of missing values in each column of the training frame.
train.isna().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [13]:
# Number of missing values in each column of the test frame.
test.isna().sum()

UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
Sentiment          0
dtype: int64

In [14]:
# Class balance of the five-level sentiment label in the training frame.
train_sentiment_counts = train["Sentiment"].value_counts()
train_sentiment_counts

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [15]:
# Class balance of the five-level sentiment label in the test frame.
test_sentiment_counts = test["Sentiment"].value_counts()
test_sentiment_counts

Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: Sentiment, dtype: int64

In [16]:
# Side-by-side bar charts of the five-level sentiment distribution.
# Colours are keyed by label and the category order is fixed: value_counts()
# sorts by frequency, which differs between train and test, so a positional
# colour list would paint the same sentiment differently in each panel.
sentiment_colors = {
    "Extremely Positive": "forestgreen",
    "Positive": "lightgreen",
    "Neutral": "lightseagreen",
    "Negative": "red",
    "Extremely Negative": "darkred",
}
fig = make_subplots(1, 2, subplot_titles=('Train set', 'Test set'))
for panel_col, (split_name, split_df) in enumerate((("train", train), ("test", test)), start=1):
    # Reindex into the fixed label order; a label absent from a split shows as 0.
    split_counts = split_df["Sentiment"].value_counts().reindex(
        list(sentiment_colors), fill_value=0
    )
    fig.add_trace(
        go.Bar(
            x=split_counts.index,
            y=split_counts.values,
            marker_color=list(sentiment_colors.values()),
            name=split_name,
        ),
        row=1,
        col=panel_col,
    )
# A for-loop yields no cell output, so display the figure explicitly.
fig

In [17]:
# Work on independent deep copies so the raw frames stay untouched.
train_3s = train.copy(deep=True)
test_3s = test.copy(deep=True)

In [18]:
# Collapse the five sentiment levels into three by folding each
# "Extremely ..." label into its plain counterpart.
five2three = {
    "Extremely Positive": "Positive",
    "Positive": "Positive",
    "Neutral": "Neutral",
    "Negative": "Negative",
    "Extremely Negative": "Negative",
}
# Look labels up through __getitem__ so an unknown label still raises KeyError.
train_3s["Sentiment"] = train_3s["Sentiment"].map(five2three.__getitem__)
test_3s["Sentiment"] = test_3s["Sentiment"].map(five2three.__getitem__)

In [19]:
# Class balance of the three-level sentiment label in the training frame.
train_3s_sentiment_counts = train_3s["Sentiment"].value_counts()
train_3s_sentiment_counts

Positive    18046
Negative    15398
Neutral      7713
Name: Sentiment, dtype: int64

In [20]:
# Side-by-side bar charts of the three-level sentiment distribution.
# Colours are keyed by label and the category order is fixed: value_counts()
# sorts by frequency, so a positional colour list would mismatch colours and
# sentiments (e.g. "Neutral" drawn in red) and differ between the two panels.
sentiment_colors_3s = {
    "Positive": "forestgreen",
    "Neutral": "lightseagreen",
    "Negative": "red",
}
fig = make_subplots(1, 2, subplot_titles=('Train set', 'Test set'))
for panel_col, (split_name, split_df) in enumerate((("train", train_3s), ("test", test_3s)), start=1):
    # Reindex into the fixed label order; a label absent from a split shows as 0.
    split_counts = split_df["Sentiment"].value_counts().reindex(
        list(sentiment_colors_3s), fill_value=0
    )
    fig.add_trace(
        go.Bar(
            x=split_counts.index,
            y=split_counts.values,
            marker_color=list(sentiment_colors_3s.values()),
            name=split_name,
        ),
        row=1,
        col=panel_col,
    )
# A for-loop yields no cell output, so display the figure explicitly.
fig

# Cleaning & Normalization 1



*   Deleting Duplicate Tweets
*   Deleting Unnecessary Columns
*   Creating 3- and 5-Sentiment Datasets


### Copy of train and test data

In [21]:
# Start the first normalization pass from deep copies of the raw frames.
train_n1 = train.copy(deep=True)
test_n1 = test.copy(deep=True)

### Deleting Duplicate Tweets

In [22]:
# Keep only the first row for each distinct tweet text.
train_n1 = train_n1.drop_duplicates(subset=["OriginalTweet"], keep="first")
test_n1 = test_n1.drop_duplicates(subset=["OriginalTweet"], keep="first")

### Deleting Unnecessary Columns

In [23]:
# Retain just the tweet text and its sentiment label.
kept_columns = ["OriginalTweet", "Sentiment"]
train_n1 = train_n1.loc[:, kept_columns]
test_n1 = test_n1.loc[:, kept_columns]

In [24]:
# Print the dtypes and non-null counts of the reduced training frame.
train_n1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41157 entries, 0 to 41156
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OriginalTweet  41157 non-null  object
 1   Sentiment      41157 non-null  object
dtypes: object(2)
memory usage: 964.6+ KB


In [25]:
# Print the dtypes and non-null counts of the reduced test frame.
test_n1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3798 entries, 0 to 3797
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OriginalTweet  3798 non-null   object
 1   Sentiment      3798 non-null   object
dtypes: object(2)
memory usage: 89.0+ KB


### Creating 3 Sentiment Dataset

In [26]:
# Deep-copy the cleaned frames so the five-level versions stay unchanged.
train_n1_3s = train_n1.copy(deep=True)
test_n1_3s = test_n1.copy(deep=True)

In [27]:
# Collapse the five sentiment levels into three by folding each
# "Extremely ..." label into its plain counterpart.
five2three = {
    "Extremely Positive": "Positive",
    "Positive": "Positive",
    "Neutral": "Neutral",
    "Negative": "Negative",
    "Extremely Negative": "Negative",
}
# Look labels up through __getitem__ so an unknown label still raises KeyError.
train_n1_3s["Sentiment"] = train_n1_3s["Sentiment"].map(five2three.__getitem__)
test_n1_3s["Sentiment"] = test_n1_3s["Sentiment"].map(five2three.__getitem__)

In [28]:
# Class balance of the three-level label in the cleaned training frame.
train_n1_3s_sentiment_counts = train_n1_3s["Sentiment"].value_counts()
train_n1_3s_sentiment_counts

Positive    18046
Negative    15398
Neutral      7713
Name: Sentiment, dtype: int64

### Save New Datasets

In [29]:
# Write the four cleaned datasets to CSV without the index column,
# in the same order as before (S5 train/test, then S3 test/train).
for split_frame, csv_name in (
    (train_n1, "COVID19_train_N1_S5.csv"),
    (test_n1, "COVID19_test_N1_S5.csv"),
    (test_n1_3s, "COVID19_test_N1_S3.csv"),
    (train_n1_3s, "COVID19_train_N1_S3.csv"),
):
    split_frame.to_csv(csv_name, index=False)