In [1]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

In [2]:
df_val = pd.read_csv('data/twitter_validation.csv')
df_train = pd.read_csv('data/twitter_training.csv')
df_val.drop(['id','game'],axis=1,inplace=True)
df_train.drop(['id','game'],axis=1,inplace=True)
df_val.head()

Unnamed: 0,sentiment,text
0,Irrelevant,I mentioned on Facebook that I was struggling ...
1,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,Negative,@Microsoft Why do I pay for WORD when it funct...
3,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,Neutral,Now the President is slapping Americans in the...


In [34]:
df_train.head()

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
# helper function to clean tweets
def processTweet(tweet):
    if isinstance(tweet, float):
        return str(tweet)
    # remove user handles tagged in the tweet
    tweet = re.sub('@[^\s]+','',tweet)
    # remove words that start with th dollar sign    
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    tweet = re.sub(r'(?:^|[\s,])([\w-]+\.[a-z]{2,}\S*)\b','',tweet)
    # remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # remove all kinds of punctuations and special characters
    punkt = string.punctuation + r'''`‘’)(+÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”،.”…“–ـ”.°ा'''
    tweet = tweet.translate(str.maketrans('', '', punkt))
    # remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # remove stopwords
    tweet = re.sub(r'\b('+ '|'.join(stopword for stopword in stopwords.words('english'))+ r')\b', '', tweet)
    # remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    tweet = tweet.rstrip(' ')
    # remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    tweet = ''.join(c for c in tweet if c <= '\uffff')
    tweet = re.sub(r'([^\u1F600-\u1F6FF\s])','', tweet)
    # lowercase
    tweet = tweet.lower()
    # remove extra spaces
    tweet = re.sub(r'[\s]{2, }', ' ', tweet)
    
    return tweet

df_val['clean_text'] = df_val['text'].apply(processTweet)
df_val['label'] = LabelEncoder().fit_transform(df_val['label'])
df_val.drop_duplicates(subset=['clean_text'],inplace=True)
df_val

Unnamed: 0,sentiment,text,clean_text
0,0,I mentioned on Facebook that I was struggling ...,mentioned facebook struggling motivation r...
1,2,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claims...
2,1,@Microsoft Why do I pay for WORD when it funct...,why pay word functions poorly chromebook
3,1,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking full closet hacking truly aw...
4,2,Now the President is slapping Americans in the...,now president slapping americans face reall...
...,...,...,...
995,0,⭐️ Toronto is the arts and culture capital of ...,toronto arts culture capital canada wonder...
996,0,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,this actually good move tot bring more viewers...
997,3,Today sucked so it’s time to drink wine n play...,today sucked time drink wine play borderlands...
998,3,Bought a fraction of Microsoft today. Small wins.,bought fraction microsoft today small wins


In [36]:
df_val.drop(['text'],axis=1,inplace=True)
df_val.rename(columns={"clean_text": "text"},inplace=True)
df_val.dropna(inplace=True)
df_val.drop(df_val[df_val['text'] == ''].index, inplace = True)
df_val.drop(df_val[df_val['text'] == ' '].index, inplace = True)
df_val.drop(df_val[df_val['text'] == 'nan'].index, inplace = True)
df_val.to_csv('data/cleaned_twitter_validation.csv',index=False)
df_val.head()

Unnamed: 0,sentiment,text
0,0,mentioned facebook struggling motivation r...
1,2,bbc news amazon boss jeff bezos rejects claims...
2,1,why pay word functions poorly chromebook
3,1,csgo matchmaking full closet hacking truly aw...
4,2,now president slapping americans face reall...


In [4]:
df_train['clean_text'] = df_train['text'].apply(processTweet)
df_train['label'] = LabelEncoder().fit_transform(df_train['label'])
df_train.drop_duplicates(subset=['clean_text'],inplace=True)
df_train.head()
df_train.drop(['text'],axis=1,inplace=True)
df_train.rename(columns={"clean_text": "text"},inplace=True)
df_train.dropna(inplace=True)
df_train.drop(df_train[df_train['text'] == ''].index, inplace = True)
df_train.drop(df_train[df_train['text'] == ' '].index, inplace = True)
df_train.drop(df_train[df_train['text'] == 'nan'].index, inplace = True)
df_train.to_csv('data/cleaned_twitter_training.csv',index=False)
df_val.isnull().sum()

sentiment     0
text          0
clean_text    0
dtype: int64