In [1]:
# Mount Google Drive inside the Colab runtime
from google.colab import drive

# Directory at which the drive is mounted
ROOT = "/content/drive"
print(ROOT)  # optional: echo the mount point

# force_remount=True is passed so the mount is redone when this cell is run again
drive.mount(ROOT, force_remount=True)

/content/drive
Mounted at /content/drive


In [2]:
%pwd
%cd 'drive/My Drive/Workspaces/twitter_sentiment_analysis'

/content/drive/My Drive/Workspaces/twitter_sentiment_analysis


# EDA and Cleaning
1. Importing data
2. Exploring data
3. Cleaning data
4. Saving data

In [4]:
import pandas as pd

# Read both CSV files; the paths are relative to the current working directory.
train_df = pd.read_csv(filepath_or_buffer='data/train.csv')
test_df = pd.read_csv(filepath_or_buffer='data/test.csv')

In [6]:
# Remove the literal '@user' placeholder from every training tweet.
# regex=False makes the literal (non-regex) match explicit.
train_df['tweet'] = train_df['tweet'].str.replace('@user', '', regex=False)

## Target distribution

In [9]:
# Number of training tweets per label value
train_df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

The target classes are imbalanced: 29,720 tweets have label 0 and only 2,242 have label 1.
This imbalance needs to be handled later on.

In [12]:
# True if at least one value in the tweet column is missing
train_df['tweet'].isna().any()

False

There are no missing values in the `tweet` column.

### Since this is Twitter data, hashtags are important.
Extract the hashtags and analyse their effect.

In [17]:
# Collect each tweet's hashtags as a comma-separated string ('' when there are none).
# Only whitespace-delimited tokens that start with '#' are considered, and the
# leading '#' is dropped. A bare '#' token is skipped so that it cannot add an
# empty entry (e.g. ',,tag') to the list.
train_df['hashtags'] = train_df.tweet.apply(
    lambda tweet: ','.join(
        word[1:] for word in tweet.split() if word.startswith('#') and len(word) > 1
    )
)

In [31]:
# Share of tweets whose extracted hashtag string is non-empty
has_hashtags_mask = train_df.hashtags != ''
hashtags = train_df.loc[has_hashtags_mask, 'hashtags'].count()
no_hashtags = train_df.loc[~has_hashtags_mask, 'hashtags'].count()
hashtag_share_pct = round(100 * (hashtags / (hashtags + no_hashtags)), 2)
print('{}% of tweets have hashtags.'.format(hashtag_share_pct))

72.98% of tweets have hashtags.


In [33]:
# Label counts restricted to tweets without any hashtag
train_df.loc[train_df['hashtags'] == '', 'label'].value_counts()

0    8024
1     613
Name: label, dtype: int64

In [24]:
# Number of tweets that contain at least one hashtag
train_df.loc[train_df['hashtags'] != '', 'hashtags'].count()

23325

## Data Cleaning

In [61]:
# Seed a separate cleaned_tweet column from the tweet column
train_df['cleaned_tweet'] = train_df.tweet

In [None]:
!pip install contractions

In [62]:
# Expand contractions, then lowercase the result
import contractions

train_df['cleaned_tweet'] = (
    train_df['cleaned_tweet']
    .apply(contractions.fix)
    .str.lower()
)

In [64]:
# Drop English stop words from every cleaned tweet
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Keep the stop words in a set for membership tests
stop_words = set(stopwords.words('english'))

# Split on whitespace, discard stop-word tokens, and re-join with single spaces
train_df['cleaned_tweet'] = train_df['cleaned_tweet'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_words)
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
# Drop every non-ASCII character: encoding with errors='ignore' discards them,
# and decoding turns the remaining bytes back into a str.
train_df['cleaned_tweet'] = train_df['cleaned_tweet'].map(
    lambda text: text.encode('ascii', errors='ignore').decode('ascii')
)

In [69]:
# Remove punctuation
import re

# Delete every character that is neither a word character (\w) nor whitespace (\s),
# then re-join the remaining tokens with single spaces. Without the re-join, deleting
# a stand-alone punctuation token such as "!" leaves doubled or trailing spaces.
train_df['cleaned_tweet'] = train_df.cleaned_tweet.apply(
    lambda tweet: ' '.join(re.sub(r'[^\w\s]', '', tweet).split())
)

In [None]:
# TODO: Lemmatization - placeholder cell, this step is not implemented yet

In [70]:
# Preview the first ten rows, including the hashtags and cleaned_tweet columns
train_df.head(n=10)

Unnamed: 0,id,label,tweet,hashtags,cleaned_tweet
0,1,0,when a father is dysfunctional and is so sel...,run,father dysfunctional selfish drags kids dysfun...
1,2,0,thanks for #lyft credit i can't use cause th...,"lyft,disapointed,getthanked",thanks lyft credit use offer wheelchair vans p...
2,3,0,bihday your majesty,,bihday majesty
3,4,0,#model i love u take with u all the time in ...,model,model love take time ur
4,5,0,factsguide: society now #motivation,motivation,factsguide society motivation
5,6,0,[2/2] huge fan fare and big talking before the...,allshowandnogo,22 huge fan fare big talking leave chaos pay d...
6,7,0,camping tomorrow dannyâ¦,,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,"school,exams,hate,imagine,actorslife,revolutio...",next school year year exams think school exam...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,"allin,cavs,champions,cleveland,clevelandcavaliers",won love land allin cavs champions cleveland c...
9,10,0,welcome here ! i'm it's so #gr8 !,gr8,welcome gr8
