# Imports

In [62]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
import string
import re
from spellchecker import SpellChecker # pip install pyspellchecker
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\paqui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
# Load the raw train / test splits from the current working directory.
tweet = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Preview the first three training rows.
tweet.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


# Data Cleaning

### Removing Uppercase

In [65]:
def lower_case(text):
    """Return ``text`` with every cased character converted to lowercase."""
    return text.lower()

In [66]:
# Lowercase the text column of both splits (function passed directly, no lambda wrapper).
tweet['text'] = tweet['text'].apply(lower_case)
test['text'] = test['text'].apply(lower_case)

<bound method NDFrame.head of 0       our deeds are the reason of this #earthquake m...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
3258    earthquake safety los angeles ûò safety faste...
3259    storm in ri worse than last hurricane. my city...
3260    green line derailment in chicago http://t.co/u...
3261    meg issues hazardous weather outlook (hwo) htt...
3262    #cityofcalgary has activated its municipal eme...
Name: text, Length: 10876, dtype: object>

### Removing URLs

In [67]:
def remove_URL(text):
    """Strip every ``http(s)://...`` or ``www....`` token from ``text``.

    A match runs up to the next whitespace character; the whitespace around
    the removed URL is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [68]:
# Drop URLs from the text column of both splits.
tweet['text'] = tweet['text'].apply(remove_URL)
test['text'] = test['text'].apply(remove_URL)

<bound method NDFrame.head of 0       our deeds are the reason of this #earthquake m...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
3258    earthquake safety los angeles ûò safety faste...
3259    storm in ri worse than last hurricane. my city...
3260                    green line derailment in chicago 
3261          meg issues hazardous weather outlook (hwo) 
3262    #cityofcalgary has activated its municipal eme...
Name: text, Length: 10876, dtype: object>

### Removing HTML tags

In [69]:
def remove_html(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy, so each tag is removed individually and the
    content between tags is kept.
    """
    return re.sub(r'<.*?>', r'', text)

In [70]:
# Strip HTML tags from the text column of both splits.
tweet['text'] = tweet['text'].apply(remove_html)
test['text'] = test['text'].apply(remove_html)

<bound method NDFrame.head of 0       our deeds are the reason of this #earthquake m...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
3258    earthquake safety los angeles ûò safety faste...
3259    storm in ri worse than last hurricane. my city...
3260                    green line derailment in chicago 
3261          meg issues hazardous weather outlook (hwo) 
3262    #cityofcalgary has activated its municipal eme...
Name: text, Length: 10876, dtype: object>

### Removing emojis

In [71]:
# Compiled once at definition time instead of on every call.
# The original class contained the single range U+24C2-U+1F251, which spans
# almost the entire Basic Multilingual Plane above U+24C2 and therefore also
# deleted CJK, Hangul, fullwidth forms, etc.  It is replaced by the specific
# symbol blocks that range was meant to reach.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2"             # circled latin capital letter M
                            u"\U00002600-\U000026FF"  # miscellaneous symbols
                            u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                            u"\U0000FE0F"             # variation selector-16
                            u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
                            "]+", flags=re.UNICODE)


def remove_emoji(text):
    """Remove emoji and pictographic symbols from ``text``.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are removed;
    ordinary letters of any script (Latin, CJK, Hangul, ...) are preserved.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with every matching character deleted (nothing is inserted
        in its place, so surrounding whitespace is unchanged).
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [72]:
# Remove emoji from the text column of both splits.
tweet['text'] = tweet['text'].apply(remove_emoji)
test['text'] = test['text'].apply(remove_emoji)

<bound method NDFrame.head of 0       our deeds are the reason of this #earthquake m...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
3258    earthquake safety los angeles ûò safety faste...
3259    storm in ri worse than last hurricane. my city...
3260                    green line derailment in chicago 
3261          meg issues hazardous weather outlook (hwo) 
3262    #cityofcalgary has activated its municipal eme...
Name: text, Length: 10876, dtype: object>

### Removing punctuation

In [73]:
def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Characters are deleted, not replaced by a space, so words separated only
    by punctuation are fused together.
    """
    return ''.join(ch for ch in text if ch not in string.punctuation)

In [74]:
# Strip ASCII punctuation from the text column of both splits.
tweet['text'] = tweet['text'].apply(remove_punct)
test['text'] = test['text'].apply(remove_punct)

<bound method NDFrame.head of 0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13000 people receive wildfires evacuation orde...
4       just got sent this photo from ruby alaska as s...
                              ...                        
3258    earthquake safety los angeles ûò safety faste...
3259    storm in ri worse than last hurricane my citya...
3260                    green line derailment in chicago 
3261            meg issues hazardous weather outlook hwo 
3262    cityofcalgary has activated its municipal emer...
Name: text, Length: 10876, dtype: object>

### Spelling Check

In [75]:
spell = SpellChecker()


def correct_spellings(text):
    """Replace words unknown to the spell checker with its best correction.

    Parameters
    ----------
    text : str
        Whitespace-separated words.

    Returns
    -------
    str
        The words of ``text`` joined by single spaces, with each word flagged
        by ``spell.unknown`` replaced by ``spell.correction(word)``.  When the
        checker offers no correction the original word is kept.
    """
    words = text.split()  # split once and reuse for both the lookup and the loop
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # Recent pyspellchecker releases return None when no candidate
            # exists; fall back to the original word so join() cannot fail.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [78]:
# Spell-correct every tweet in the test split.
test['text'] = test['text'].apply(correct_spellings)

In [79]:
# Spell-correct every tweet in the training split.
tweet['text'] = tweet['text'].apply(correct_spellings)

# Tokens

### Tokenization

In [80]:
# Tokenizer that keeps runs of word characters and drops everything else.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# Replace each text string with its list of tokens, in both splits.
tweet['text'] = tweet['text'].apply(tokenizer.tokenize)
test['text'] = test['text'].apply(tokenizer.tokenize)

### Removing stop words

In [81]:
# Cache for the NLTK stop-word list so the corpus is loaded once,
# not once per token as in the original comprehension.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """Filter stop words out of a token list.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Words to drop.  When omitted, NLTK's English stop-word list is used
        (loaded on first use and cached as a frozenset).

    Returns
    -------
    list of str
        The tokens of ``text`` that are not stop words, in original order.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    return [w for w in text if w not in stop_words]

In [82]:
# Drop stop words from the token lists of both splits.
tweet['text'] = tweet['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)

### Rejoining tokens into text

In [83]:
def combine_text(list_of_text):
    """Join a list of tokens back into one space-separated string."""
    return ' '.join(list_of_text)

In [84]:
# Turn the token lists of both splits back into plain strings.
tweet['text'] = tweet['text'].apply(combine_text)
test['text'] = test['text'].apply(combine_text)

# Example of result

Cleaned `text` column of the training set (truncated preview):

In [85]:
# Display the cleaned training text column.
tweet.loc[:, 'text']

0            deeds reason earthquake may allah forgive us
1                    forest fire near la range ask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfire evacuation order...
4       got sent photo ruby alaska smoke wildfire pour...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                        m4 0104 utc5km volcano hawaii
7611    police investigating bike collided car little ...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

A single cleaned tweet (row 70) shown in full:

In [86]:
# Select the column first, then the row at position 70.
tweet['text'].iloc[70]

'personalinjury accident summer read advice amp see solicitor help otleyhour'

In [87]:
# Persist the cleaned splits; fields are separated by "*" and the index is omitted.
tweet.to_csv(
    "train_clean.csv",
    index=False,
    sep="*",
    encoding="utf-8",
)
test.to_csv(
    "test_clean.csv",
    index=False,
    sep="*",
    encoding="utf-8",
)