In [159]:
import numpy as np
import re
import pandas as pd
import string
import seaborn as sb
from gensim.models import Word2Vec, KeyedVectors


In [160]:
# Load the training data from train.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer='train.csv')


In [161]:
# Build the working frame: discard the 'keyword' and 'location' columns and
# lowercase the tweet text, leaving `df` itself untouched.
dropped_columns = (
    df
    .drop(columns=['keyword', 'location'])
    .assign(text=lambda frame: frame['text'].str.lower())
)

In [162]:
# Helper that strips punctuation from one text value.
def eliminate_punctuation(column):
    """Return ``column`` with every ``string.punctuation`` character deleted.

    Despite its name, ``column`` is a single string (one cell of the text
    column); the function is meant to be mapped over a Series.
    """
    # A translation table mapping a character to None deletes that character.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return column.translate(deletions)
# Preview only: the punctuation-free Series is displayed but not assigned
# back, so dropped_columns['text'] is left unchanged by this line.
dropped_columns['text'].apply(eliminate_punctuation)


0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13000 people receive wildfires evacuation orde...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609                                                 name
7610    m194 0104 utc5km s of volcano hawaii httptcozd...
7611    police investigating after an ebike collided w...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

In [163]:
#function to get rid of special characters, digits and URLs
def preprocess(column):
    """Clean a single tweet string: drop links, apostrophes, symbols and digits.

    Despite its name, ``column`` is one string (one cell of the text column),
    not a whole Series.

    Steps, in order:
      1. Tokens starting at ``http`` or ``pic.`` (links) are replaced by a space.
      2. Newlines become spaces; apostrophes are deleted.
      3. Every remaining character that is neither a word character nor
         whitespace is replaced by a space.
      4. Runs of digits are deleted.

    Whitespace is not collapsed, so the result may contain repeated spaces.

    Returns:
        The cleaned string.
    """
    # Links are removed first, while they are still intact: replacing '-', '#'
    # or "'" beforehand splits a link at that character and leaves its tail
    # behind in the text.
    # The dot after "pic" is escaped so only "pic."-style links match; an
    # unescaped dot also swallows ordinary words such as "picture".
    temp = re.sub(r"(?:https?|pic\.)\S+", " ", column)
    temp = temp.replace("\n", " ").replace("'", "")
    # '-' and '#' need no dedicated step: both are non-word, non-space
    # characters and are turned into spaces here.
    temp = re.sub(r"[^\w\s]", " ", temp)
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    return re.sub(r"\d+", "", temp)

# Run preprocess() on every tweet and store the cleaned text back in place.
dropped_columns['text'] = dropped_columns['text'].apply(preprocess)

In [164]:
# Display the text column to inspect the result of the preprocess() pass.
dropped_columns['text']

0       our deeds are the reason of this  earthquake m...
1                  forest fire near la ronge sask  canada
2       all residents asked to shelter in place are be...
3         people receive  wildfires evacuation orders ...
4       just got sent this photo from ruby  alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609                                                name 
7610                 m     utc  km s of volcano hawaii   
7611    police investigating after an e bike collided ...
7612    the latest  more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [165]:
from nltk.corpus import stopwords

# English stop words from NLTK, held in a set for fast membership tests.
STOPWORDS = {*stopwords.words('english')}

In [166]:
def remove_stopwords(column):
    """Return ``column`` without the words listed in ``STOPWORDS``.

    ``column`` is a single string. It is split on whitespace, words found in
    the module-level ``STOPWORDS`` set are dropped, and the survivors are
    re-joined with single spaces.
    """
    kept = (token for token in column.split() if token not in STOPWORDS)
    return ' '.join(kept)

In [167]:
# Strip stop words from every tweet and store the result back in place.
dropped_columns['text'] = dropped_columns['text'].apply(remove_stopwords)

In [168]:
# Preview the first five rows after stop-word removal.
dropped_columns.head(n=5)


Unnamed: 0,id,text,target
0,1,deeds reason earthquake may allah forgive us,1
1,4,forest fire near la ronge sask canada,1
2,5,residents asked shelter place notified officer...,1
3,6,people receive wildfires evacuation orders cal...,1
4,7,got sent photo ruby alaska smoke wildfires pou...,1


In [188]:
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance used by lemmatizer() below; despite its name,
# `root_word` is the WordNetLemmatizer object, not a word.
root_word = WordNetLemmatizer()
def lemmatizer(column, pos="n"):
    """Lemmatize every whitespace-separated word of a string.

    Args:
        column: A single string (one cell of the text column), despite the
            name.
        pos: Part-of-speech tag passed to ``root_word.lemmatize`` for every
            word. The default ``"n"`` keeps the previous behaviour, in which
            each word is treated as a noun: verb forms such as "notified"
            are left as they are and "us" is reduced to "u". Pass ``"v"`` to
            reduce verb forms instead (``"notified"`` -> ``"notify"``).

    Returns:
        The lemmatized words joined by single spaces.
    """
    return " ".join(root_word.lemmatize(word, pos=pos) for word in column.split())

In [189]:
# Lemmatize every tweet and store the result back in place.
dropped_columns['text'] = dropped_columns['text'].apply(lemmatizer)

In [191]:
# Display the frame after lemmatization (pandas truncates the middle rows).
dropped_columns

Unnamed: 0,id,text,target
0,1,deed reason earthquake may allah forgive u,1
1,4,forest fire near la ronge sask canada,1
2,5,resident asked shelter place notified officer ...,1
3,6,people receive wildfire evacuation order calif...,1
4,7,got sent photo ruby alaska smoke wildfire pour...,1
...,...,...,...
7608,10869,two giant crane holding bridge collapse nearby...,1
7609,10870,name,1
7610,10871,utc km volcano hawaii,1
7611,10872,police investigating e bike collided car littl...,1


In [190]:
# Spot check: with an explicit verb tag the lemmatizer reduces "notified".
root_word.lemmatize("notified", pos="v")

'notify'