In [27]:
import nltk
import re

In [28]:
import pandas as pd
# Load the dataset using read_csv's default comma delimiter. header=None tells
# pandas not to treat the first line as a header, so column names are supplied.
data = pd.read_csv(
    'Dataset.csv',
    header=None,
    names=['body_text', 'label'],
)
# Preview the first five rows.
data.head()

Unnamed: 0,body_text,label
0,This totally topped off my week :') #sarcasm,NOT_SARCASM
1,Wow.. How freaking awesome was that. #Sarcasm,NOT_SARCASM
2,Summers over and it was shit. Only sunny day w...,NOT_SARCASM
3,Did you know that one of Bolshoi Ballet member...,NOT_SARCASM
4,I just love missing the bus! ☺ #sarcasm,SARCASM


In [29]:
#Cleaning Data
def clean_text(text):
    """Normalize a raw tweet into lower-case, space-separated alphabetic words.

    Steps, in order: drop hashtags (``#`` up to the next whitespace), drop
    ``http...`` links, drop apostrophes (so "don't" becomes "dont"), replace
    every remaining character that is not an ASCII letter with a space,
    collapse runs of whitespace, and lower-case the result.

    Args:
        text: Raw tweet text. Non-string values (e.g. ``None`` or the ``NaN``
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing alphabetic remains.
    """
    if not isinstance(text, str):
        return ""
    # Remove hashtags. A hashtag ends at any whitespace, not only at a literal
    # space, so a tab or newline after a hashtag does not swallow the words
    # that follow it.
    text = re.sub(r"#\S*", "", text)
    # Remove links.
    text = re.sub(r"http\S+", "", text)
    # Remove apostrophes before the generic replacement below so contractions
    # stay one word instead of being split in two.
    text = re.sub("'", "", text)
    # Replace everything except ASCII letters with a space.
    text = re.sub("[^a-zA-Z]", " ", text)
    # Collapse whitespace runs and trim both ends.
    text = ' '.join(text.split())
    # Convert to lower case.
    return text.lower()


# Store a cleaned copy of every tweet alongside the raw text.
data['body_text_clean'] = data['body_text'].apply(clean_text)

data.head()

Unnamed: 0,body_text,label,body_text_clean
0,This totally topped off my week :') #sarcasm,NOT_SARCASM,this totally topped off my week
1,Wow.. How freaking awesome was that. #Sarcasm,NOT_SARCASM,wow how freaking awesome was that
2,Summers over and it was shit. Only sunny day w...,NOT_SARCASM,summers over and it was shit only sunny day wa...
3,Did you know that one of Bolshoi Ballet member...,NOT_SARCASM,did you know that one of bolshoi ballet member...
4,I just love missing the bus! ☺ #sarcasm,SARCASM,i just love missing the bus


In [30]:
#Tokenization
import numpy as np
def tokenize(text):
    """Split text into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for ``text`` (a list of
        token strings).
    """
    return nltk.word_tokenize(text)

# Lower-case each cleaned tweet, then split it into tokens. clean_text already
# ends by lower-casing its result, so the extra lower() here is only a safeguard
# that keeps token comparisons case-insensitive.
data['body_text_tokenized'] = data['body_text_clean'].apply(
    lambda cleaned: tokenize(cleaned.lower())
)
data.head()



Unnamed: 0,body_text,label,body_text_clean,body_text_tokenized
0,This totally topped off my week :') #sarcasm,NOT_SARCASM,this totally topped off my week,"[this, totally, topped, off, my, week]"
1,Wow.. How freaking awesome was that. #Sarcasm,NOT_SARCASM,wow how freaking awesome was that,"[wow, how, freaking, awesome, was, that]"
2,Summers over and it was shit. Only sunny day w...,NOT_SARCASM,summers over and it was shit only sunny day wa...,"[summers, over, and, it, was, shit, only, sunn..."
3,Did you know that one of Bolshoi Ballet member...,NOT_SARCASM,did you know that one of bolshoi ballet member...,"[did, you, know, that, one, of, bolshoi, balle..."
4,I just love missing the bus! ☺ #sarcasm,SARCASM,i just love missing the bus,"[i, just, love, missing, the, bus]"


In [31]:
#Removing stopwords
stopword = nltk.corpus.stopwords.words('english')# All English Stopwords
# Hashed copy of the list above: remove_stopwords tests membership once per
# token, and a set lookup avoids scanning the whole list each time.
_stopword_set = frozenset(stopword)

def remove_stopwords(tokenized_list):
    """Return the tokens that are not English stop words.

    Args:
        tokenized_list: Iterable of token strings. Tokens are compared to the
            stop-word list exactly as given (no case folding is done here).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    return [word for word in tokenized_list if word not in _stopword_set]

# Filter the stop words out of every token list.
data['body_text_nostop'] = data['body_text_tokenized'].apply(remove_stopwords)

data.head()

Unnamed: 0,body_text,label,body_text_clean,body_text_tokenized,body_text_nostop
0,This totally topped off my week :') #sarcasm,NOT_SARCASM,this totally topped off my week,"[this, totally, topped, off, my, week]","[totally, topped, week]"
1,Wow.. How freaking awesome was that. #Sarcasm,NOT_SARCASM,wow how freaking awesome was that,"[wow, how, freaking, awesome, was, that]","[wow, freaking, awesome]"
2,Summers over and it was shit. Only sunny day w...,NOT_SARCASM,summers over and it was shit only sunny day wa...,"[summers, over, and, it, was, shit, only, sunn...","[summers, shit, sunny, day, bbq, north, west, ..."
3,Did you know that one of Bolshoi Ballet member...,NOT_SARCASM,did you know that one of bolshoi ballet member...,"[did, you, know, that, one, of, bolshoi, balle...","[know, one, bolshoi, ballet, members, born, ma..."
4,I just love missing the bus! ☺ #sarcasm,SARCASM,i just love missing the bus,"[i, just, love, missing, the, bus]","[love, missing, bus]"


In [32]:
#Stemming
# One shared Porter stemmer instance, reused for every row.
ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    """Stem every token with the module-level Porter stemmer ``ps``.

    Args:
        tokenized_text: Iterable of token strings.

    Returns:
        A list holding ``ps.stem(token)`` for each token, in order.
    """
    return list(map(ps.stem, tokenized_text))

data['body_text_stemmed'] = data['body_text_nostop'].apply(lambda x: stemming(x))

data.head()

Unnamed: 0,body_text,label,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed
0,This totally topped off my week :') #sarcasm,NOT_SARCASM,this totally topped off my week,"[this, totally, topped, off, my, week]","[totally, topped, week]","[total, top, week]"
1,Wow.. How freaking awesome was that. #Sarcasm,NOT_SARCASM,wow how freaking awesome was that,"[wow, how, freaking, awesome, was, that]","[wow, freaking, awesome]","[wow, freak, awesom]"
2,Summers over and it was shit. Only sunny day w...,NOT_SARCASM,summers over and it was shit only sunny day wa...,"[summers, over, and, it, was, shit, only, sunn...","[summers, shit, sunny, day, bbq, north, west, ...","[summer, shit, sunni, day, bbq, north, west, k..."
3,Did you know that one of Bolshoi Ballet member...,NOT_SARCASM,did you know that one of bolshoi ballet member...,"[did, you, know, that, one, of, bolshoi, balle...","[know, one, bolshoi, ballet, members, born, ma...","[know, one, bolshoi, ballet, member, born, mar..."
4,I just love missing the bus! ☺ #sarcasm,SARCASM,i just love missing the bus,"[i, just, love, missing, the, bus]","[love, missing, bus]","[love, miss, bu]"


In [36]:
def pos(text):
    """Tag each token with its part of speech and fuse each pair into a string.

    Args:
        text: Sequence of token strings, passed straight to ``nltk.pos_tag``.

    Returns:
        A list of ``"<token>_<tag>"`` strings, one per tagged token, in order.
    """
    return [word + '_' + tag for word, tag in nltk.pos_tag(text)]
# Attach a part-of-speech tag to every stemmed token.
# NOTE(review): the tagger is given stems (e.g. "bu", "awesom") rather than the
# original words, which may hurt tag accuracy - consider tagging before stemming.
data['body_text_pos'] = data['body_text_stemmed'].apply(pos)

data.head()

Unnamed: 0,body_text,label,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_pos
0,This totally topped off my week :') #sarcasm,NOT_SARCASM,this totally topped off my week,"[this, totally, topped, off, my, week]","[totally, topped, week]","[total, top, week]","[total_JJ, top_JJ, week_NN]"
1,Wow.. How freaking awesome was that. #Sarcasm,NOT_SARCASM,wow how freaking awesome was that,"[wow, how, freaking, awesome, was, that]","[wow, freaking, awesome]","[wow, freak, awesom]","[wow_NN, freak_NN, awesom_NN]"
2,Summers over and it was shit. Only sunny day w...,NOT_SARCASM,summers over and it was shit only sunny day wa...,"[summers, over, and, it, was, shit, only, sunn...","[summers, shit, sunny, day, bbq, north, west, ...","[summer, shit, sunni, day, bbq, north, west, k...","[summer_NN, shit_NNS, sunni_JJ, day_NN, bbq_VB..."
3,Did you know that one of Bolshoi Ballet member...,NOT_SARCASM,did you know that one of bolshoi ballet member...,"[did, you, know, that, one, of, bolshoi, balle...","[know, one, bolshoi, ballet, members, born, ma...","[know, one, bolshoi, ballet, member, born, mar...","[know_VB, one_CD, bolshoi_JJ, ballet_NN, membe..."
4,I just love missing the bus! ☺ #sarcasm,SARCASM,i just love missing the bus,"[i, just, love, missing, the, bus]","[love, missing, bus]","[love, miss, bu]","[love_VB, miss_NN, bu_NN]"


In [37]:
# Write an intermediate snapshot of the frame (index plus every column built so
# far) as comma-separated text. The list-valued columns are written as text.
data.to_csv(path_or_buf="Tweets_cleaned.csv", sep=',')

In [41]:
def join(tokens):
    """Concatenate tokens into one space-separated string.

    Uses ``str.join``, so the result has no leading or trailing space and is
    built in a single pass instead of by repeated string concatenation.

    Args:
        tokens: Iterable of strings (here, the ``word_TAG`` items in the
            ``body_text_pos`` column).

    Returns:
        The tokens separated by single spaces; ``""`` for an empty iterable.
    """
    return " ".join(tokens)
    
# Flatten each list of tagged tokens back into a single string per tweet.
data['body_text_join'] = data['body_text_pos'].apply(join)
data.head()


Unnamed: 0,body_text,label,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_pos,body_text_join
0,This totally topped off my week :') #sarcasm,NOT_SARCASM,this totally topped off my week,"[this, totally, topped, off, my, week]","[totally, topped, week]","[total, top, week]","[total_JJ, top_JJ, week_NN]",total_JJ top_JJ week_NN
1,Wow.. How freaking awesome was that. #Sarcasm,NOT_SARCASM,wow how freaking awesome was that,"[wow, how, freaking, awesome, was, that]","[wow, freaking, awesome]","[wow, freak, awesom]","[wow_NN, freak_NN, awesom_NN]",wow_NN freak_NN awesom_NN
2,Summers over and it was shit. Only sunny day w...,NOT_SARCASM,summers over and it was shit only sunny day wa...,"[summers, over, and, it, was, shit, only, sunn...","[summers, shit, sunny, day, bbq, north, west, ...","[summer, shit, sunni, day, bbq, north, west, k...","[summer_NN, shit_NNS, sunni_JJ, day_NN, bbq_VB...",summer_NN shit_NNS sunni_JJ day_NN bbq_VBD no...
3,Did you know that one of Bolshoi Ballet member...,NOT_SARCASM,did you know that one of bolshoi ballet member...,"[did, you, know, that, one, of, bolshoi, balle...","[know, one, bolshoi, ballet, members, born, ma...","[know, one, bolshoi, ballet, member, born, mar...","[know_VB, one_CD, bolshoi_JJ, ballet_NN, membe...",know_VB one_CD bolshoi_JJ ballet_NN member_NN...
4,I just love missing the bus! ☺ #sarcasm,SARCASM,i just love missing the bus,"[i, just, love, missing, the, bus]","[love, missing, bus]","[love, miss, bu]","[love_VB, miss_NN, bu_NN]",love_VB miss_NN bu_NN


In [45]:
# Write the final frame, including the joined token column, as comma-separated
# text.
data.to_csv(path_or_buf="Fin_Tweets_cleaned.csv", sep=',')