# Data Identification

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('dirty-en.csv', encoding = 'latin-1')
df.head()

Unnamed: 0,Date,Username,Tweet
0,2021-11-29T16:44:27.000Z,"Jack\n@_JackRFC_\nÂ·\nNov 29, 2021",Might just sack everything off and go and live...
1,2021-11-29T15:36:51.000Z,"Muki\n@Mukila19\nÂ·\nNov 29, 2021",3 rd island Aachu arjun sir should have questi...
2,2021-11-29T14:50:23.000Z,"BALI Awards\n@BALI_Awards\nÂ·\nNov 29, 2021",Reply with your favourite BALI Awards memories...
3,2021-11-29T14:26:04.000Z,No Base! æ²ç¸ Okinawa\n@nobaseyellow\nÂ·\nNo...,But are Japanese free to pop over to Hawaii fo...
4,2021-11-29T14:24:28.000Z,Mythological Africans\n@MythicAfricans\nÂ·\nNo...,#MythologyMonday\nThis myth explains the origi...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26762 entries, 0 to 26761
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      26762 non-null  object
 1   Username  26762 non-null  object
 2   Tweet     26762 non-null  object
dtypes: object(3)
memory usage: 627.4+ KB


# Data Cleaning

In [4]:
df.drop_duplicates(inplace = True)

In [5]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20131 entries, 0 to 26761
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      20131 non-null  object
 1   Username  20131 non-null  object
 2   Tweet     20131 non-null  object
dtypes: object(3)
memory usage: 629.1+ KB


Unnamed: 0,Date,Username,Tweet
0,2021-11-29T16:44:27.000Z,"Jack\n@_JackRFC_\nÂ·\nNov 29, 2021",Might just sack everything off and go and live...
1,2021-11-29T15:36:51.000Z,"Muki\n@Mukila19\nÂ·\nNov 29, 2021",3 rd island Aachu arjun sir should have questi...
2,2021-11-29T14:50:23.000Z,"BALI Awards\n@BALI_Awards\nÂ·\nNov 29, 2021",Reply with your favourite BALI Awards memories...
3,2021-11-29T14:26:04.000Z,No Base! æ²ç¸ Okinawa\n@nobaseyellow\nÂ·\nNo...,But are Japanese free to pop over to Hawaii fo...
4,2021-11-29T14:24:28.000Z,Mythological Africans\n@MythicAfricans\nÂ·\nNo...,#MythologyMonday\nThis myth explains the origi...


In [6]:
import re
from nltk.tokenize import word_tokenize

In [7]:
def tweet_cleansing(tweet):
    tweet = tweet.lower()
    tweet = tweet.replace(',', '')
    tweet = re.sub('(www\.[^s]+ | (https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub('[^\w]', ' ', tweet)
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    tweet = re.sub('#([^\s]+)', '\1', tweet)
    tweet = re.sub('[^a-zA-Z#]', ' ', tweet)
    tweet = tweet.strip('\'"')
    return tweet

In [8]:
df['cleaned_tweet'] = df.Tweet.apply(tweet_cleansing).values
df['cleaned_tweet'] = df.cleaned_tweet.apply(lambda x: ' '.join([j for j in x.split() if len(j)>3]))

In [9]:
df['len_words'] = df.cleaned_tweet.apply(lambda x: len(x))

In [10]:
df.head()

Unnamed: 0,Date,Username,Tweet,cleaned_tweet,len_words
0,2021-11-29T16:44:27.000Z,"Jack\n@_JackRFC_\nÂ·\nNov 29, 2021",Might just sack everything off and go and live...,might just sack everything live beach bali som...,52
1,2021-11-29T15:36:51.000Z,"Muki\n@Mukila19\nÂ·\nNov 29, 2021",3 rd island Aachu arjun sir should have questi...,island aachu arjun should have questioned viji...,206
2,2021-11-29T14:50:23.000Z,"BALI Awards\n@BALI_Awards\nÂ·\nNov 29, 2021",Reply with your favourite BALI Awards memories...,reply with your favourite bali awards memories...,73
3,2021-11-29T14:26:04.000Z,No Base! æ²ç¸ Okinawa\n@nobaseyellow\nÂ·\nNo...,But are Japanese free to pop over to Hawaii fo...,japanese free over hawaii trip bali,35
4,2021-11-29T14:24:28.000Z,Mythological Africans\n@MythicAfricans\nÂ·\nNo...,#MythologyMonday\nThis myth explains the origi...,mythologymonday this myth explains origins rac...,223


In [11]:
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [12]:
stemmer = PorterStemmer()
stopword = set(stopwords.words('english'))
lemmatized = WordNetLemmatizer()

In [13]:
def tokenizerku(tweet):
    words = word_tokenize(tweet)
    tokens = []
    for i in words:
        pattern = re.compile(r'(.)\1{1,}', re.DOTALL)
        i = pattern.sub(r'\1\1', i)
        i = i.strip('\'"?,.')
        alp = re.search(r'^[a-zA-Z0-9][a-zA-Z0-9-]*$', i)
        if(i in ['AT_USER', 'URL'] or alp is None):
            continue
        else:
            i = lemmatized.lemmatize(i)
            tokens.append(i.lower())
    return token

In [14]:
df['cleaned_tweet'] = df.cleaned_tweet.apply(lambda x: ' '.join([s for s in x.split() if s not in stopword]))
df['cleaned_tweet'] = df.cleaned_tweet.apply(lambda x: stemmer.stem(x))

In [15]:
df.head()

Unnamed: 0,Date,Username,Tweet,cleaned_tweet,len_words
0,2021-11-29T16:44:27.000Z,"Jack\n@_JackRFC_\nÂ·\nNov 29, 2021",Might just sack everything off and go and live...,might sack everything live beach bali someth,52
1,2021-11-29T15:36:51.000Z,"Muki\n@Mukila19\nÂ·\nNov 29, 2021",3 rd island Aachu arjun sir should have questi...,island aachu arjun questioned viji gave left r...,206
2,2021-11-29T14:50:23.000Z,"BALI Awards\n@BALI_Awards\nÂ·\nNov 29, 2021",Reply with your favourite BALI Awards memories...,reply favourite bali awards memories take trip...,73
3,2021-11-29T14:26:04.000Z,No Base! æ²ç¸ Okinawa\n@nobaseyellow\nÂ·\nNo...,But are Japanese free to pop over to Hawaii fo...,japanese free hawaii trip bali,35
4,2021-11-29T14:24:28.000Z,Mythological Africans\n@MythicAfricans\nÂ·\nNo...,#MythologyMonday\nThis myth explains the origi...,mythologymonday myth explains origins races ku...,223


In [16]:
def tokenized_text(tweet):
    tokenized_text = tweet.split()
    return tokenized_text
df['tokens'] = df['cleaned_tweet'].apply(lambda x: tokenized_text(x))

In [17]:
df.head()

Unnamed: 0,Date,Username,Tweet,cleaned_tweet,len_words,tokens
0,2021-11-29T16:44:27.000Z,"Jack\n@_JackRFC_\nÂ·\nNov 29, 2021",Might just sack everything off and go and live...,might sack everything live beach bali someth,52,"[might, sack, everything, live, beach, bali, s..."
1,2021-11-29T15:36:51.000Z,"Muki\n@Mukila19\nÂ·\nNov 29, 2021",3 rd island Aachu arjun sir should have questi...,island aachu arjun questioned viji gave left r...,206,"[island, aachu, arjun, questioned, viji, gave,..."
2,2021-11-29T14:50:23.000Z,"BALI Awards\n@BALI_Awards\nÂ·\nNov 29, 2021",Reply with your favourite BALI Awards memories...,reply favourite bali awards memories take trip...,73,"[reply, favourite, bali, awards, memories, tak..."
3,2021-11-29T14:26:04.000Z,No Base! æ²ç¸ Okinawa\n@nobaseyellow\nÂ·\nNo...,But are Japanese free to pop over to Hawaii fo...,japanese free hawaii trip bali,35,"[japanese, free, hawaii, trip, bali]"
4,2021-11-29T14:24:28.000Z,Mythological Africans\n@MythicAfricans\nÂ·\nNo...,#MythologyMonday\nThis myth explains the origi...,mythologymonday myth explains origins races ku...,223,"[mythologymonday, myth, explains, origins, rac..."


# Data Saving

In [18]:
df = pd.DataFrame(df)
df.to_csv('en-cleaned-tweet.csv')