In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
df_test = pd.read_csv('C:/Users/bourg/OneDrive/Documents/springboard/NLP getting started/cleaned/test_data_clean.csv')
df_train = pd.read_csv('C:/Users/bourg/OneDrive/Documents/springboard/NLP getting started/cleaned/train_data_clean.csv')
df_train.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target
0,31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
1,32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
2,33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
3,34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
4,35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


Some data cleaning has already been done, but more finalization is needed. 
1: the location column will be dropped. It is mostly empty and the data that is there would take enormous effort to hand sort to make useful what with the way it is categorized -- this wouldn't be feasible for the test data or deployment. It does not appear to be a critical column either.  
2: The ID column and "unnamed: 0" column will be dropped in both the training and the test sets. 

In [3]:
df_train = df_train.drop("id", axis=1)
df_train = df_train.drop("location", axis=1)

df_test = df_test.drop("id", axis=1)
df_test = df_test.drop("location", axis=1)



In [4]:
df_train = df_train.drop("Unnamed: 0", axis=1)
df_test = df_test.drop("Unnamed: 0", axis=1)


In [5]:
df_train.head()


Unnamed: 0,keyword,text,target
0,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
1,ablaze,We always try to bring the heavy. #metal #RT h...,0
2,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
3,ablaze,Crying out for more! Set me ablaze,0
4,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [6]:
#for c in chars:
#df_test['text_clean'] = df_test['text'].str.replace(c, '')

In [7]:
#df_train['text_clean'] = df_train['text'].str.replace(["`","_","{","}","[","]","(",")",">","+","-",".","?","=","%",":","<","/"], "")
#df_train['text_clean'] = df_train['text'].str.replace(r"\\`_{}\[\]\(\)>\+-,\.\!\?:=/%", '', regex=True)   
df_train['text_clean'] = df_train['text'].str.replace(r"[\\`_{}\[\]\(\)>\+-,\.\!\?:=/%]", ' ', regex=True)    
df_test['text_clean'] = df_test['text'].str.replace(r"[\\`_{}\[\]\(\)>\+-,\.\!\?:=/%]", ' ', regex=True)                                                   

In [8]:
df_train.head()

# Change to lower case


Unnamed: 0,keyword,text,target,text_clean
0,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,@bbcmtd Wholesale Markets ablaze http t co l...
1,ablaze,We always try to bring the heavy. #metal #RT h...,0,We always try to bring the heavy #metal #RT h...
2,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,#AFRICANBAZE Breaking news Nigeria flag set a...
3,ablaze,Crying out for more! Set me ablaze,0,Crying out for more Set me ablaze
4,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...


In [9]:
#df_train['text_clean'] = df_train['text_clean'].str.lower()
#df_test['text_clean'] = df_test['text_clean'].str.lower()

I chose not to make everything lowercase, because of the nuance of internet language and casing, particularly things like all caps words or partial capitalization. ("LOOK" vs "look" vs "LoOk"). This might be something to change later if I want to tweak the model. 

In [10]:
df_train.head()

Unnamed: 0,keyword,text,target,text_clean
0,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,@bbcmtd Wholesale Markets ablaze http t co l...
1,ablaze,We always try to bring the heavy. #metal #RT h...,0,We always try to bring the heavy #metal #RT h...
2,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,#AFRICANBAZE Breaking news Nigeria flag set a...
3,ablaze,Crying out for more! Set me ablaze,0,Crying out for more Set me ablaze
4,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...


# Vectorization


In [11]:
tv = TfidfVectorizer(max_features=100, stop_words='english')
tv.fit(df_train['text_clean'])
train_transformed = tv.transform(df_train['text_clean'])

In [12]:
train_transformed

<7552x100 sparse matrix of type '<class 'numpy.float64'>'
	with 13214 stored elements in Compressed Sparse Row format>

In [13]:
train_tv_df = pd.DataFrame(train_transformed.toarray(),columns=tv.get_feature_names()).add_prefix('TFIDF_')

df_train2 = pd.concat([df_train, train_tv_df], axis=1, sort=False)


In [14]:
df_train2.head()

Unnamed: 0,keyword,text,target,text_clean,TFIDF_11,TFIDF_2015,TFIDF_accident,TFIDF_amp,TFIDF_army,TFIDF_attack,...,TFIDF_water,TFIDF_way,TFIDF_wildfire,TFIDF_work,TFIDF_world,TFIDF_wreck,TFIDF_year,TFIDF_years,TFIDF_youtube,TFIDF_ûªs
0,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,@bbcmtd Wholesale Markets ablaze http t co l...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ablaze,We always try to bring the heavy. #metal #RT h...,0,We always try to bring the heavy #metal #RT h...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,#AFRICANBAZE Breaking news Nigeria flag set a...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ablaze,Crying out for more! Set me ablaze,0,Crying out for more Set me ablaze,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Dummy variables on the keyword column: 
    

In [15]:
one_hot_encoded = pd.get_dummies(df_train, columns=['keyword'], prefix='OH')

df_train2 = pd.merge(
    left=df_train2,
    right=one_hot_encoded,
    left_index=True,
    right_index=True,
)
df_train2.head()

Unnamed: 0,keyword,text_x,target_x,text_clean_x,TFIDF_11,TFIDF_2015,TFIDF_accident,TFIDF_amp,TFIDF_army,TFIDF_attack,...,OH_weapons,OH_whirlwind,OH_wild%20fires,OH_wildfire,OH_windstorm,OH_wounded,OH_wounds,OH_wreck,OH_wreckage,OH_wrecked
0,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,@bbcmtd Wholesale Markets ablaze http t co l...,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,ablaze,We always try to bring the heavy. #metal #RT h...,0,We always try to bring the heavy #metal #RT h...,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,#AFRICANBAZE Breaking news Nigeria flag set a...,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,ablaze,Crying out for more! Set me ablaze,0,Crying out for more Set me ablaze,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
#df_train.to_csv(r'C:/Users/bourg/OneDrive/Documents/springboard/NLP getting started/cleaned/test_data_processed.csv')
#df_train2.to_csv(r'C:/Users/bourg/OneDrive/Documents/springboard/NLP getting started/cleaned/train_data_processed.csv')