In [69]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


In [33]:
# Read the three competition CSVs (train, test, sample submission) in one pass.
df_comments_train, df_comments_test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        './nlp-getting-started/train.csv',
        'nlp-getting-started/test.csv',
        'nlp-getting-started/sample_submission.csv',
    )
)

In [34]:
# Report shape and column names of both frames, one line per fact.
print(
    f'Dimension of train set rows: {df_comments_train.shape}',
    f'Dimension of test set rows: {df_comments_test.shape}',
    f'Columns of train set rows: {df_comments_train.columns}',
    f'Columns of test set rows: {df_comments_test.columns}',
    sep='\n',
)

Dimension of train set rows: (7613, 5)
Dimension of test set rows: (3263, 4)
Columns of train set rows: Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
Columns of test set rows: Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [35]:
# Inspect the row index of the test frame (positional labels, not the `id` column).
df_comments_test.index

RangeIndex(start=0, stop=3263, step=1)

In [36]:
# Display the raw test frame (columns: id, keyword, location, text).
df_comments_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [37]:
def combine_df(train, test):
    """Stack the feature columns of the train and test frames into one frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'id' and 'target'.
    test : pandas.DataFrame
        Must contain the column 'id'.

    Returns
    -------
    tuple
        ``(df, y, train_id, test_id)`` where ``df`` holds the train rows
        (without 'target' and 'id') followed by the test rows (without 'id')
        under a fresh 0..n-1 index, ``y`` is a copy of ``train['target']``,
        and ``train_id`` / ``test_id`` are copies of the respective 'id' columns.
    """
    y = train['target'].copy()
    train_id = train['id'].copy()
    test_id = test['id'].copy()
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and yields the same row order and re-numbered index.
    df = pd.concat(
        [train.drop(['target', 'id'], axis=1), test.drop(['id'], axis=1)],
        ignore_index=True,
    )
    return df, y, train_id, test_id

def split_df(all_df, n_train=7613):
    """Split a combined frame back into its train and test parts by position.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Frame whose first ``n_train`` rows are the training rows and whose
        remaining rows are the test rows.
    n_train : int, optional
        Number of leading rows that belong to the training set. Defaults to
        7613, the row count of the train CSV this notebook loads, so existing
        ``split_df(all_df)`` calls behave as before.

    Returns
    -------
    tuple
        ``(train, test)`` positional slices of ``all_df``; the original index
        labels are kept (the test part is not re-numbered).
    """
    train = all_df.iloc[:n_train]
    test = all_df.iloc[n_train:]
    return train, test

In [38]:
# Stack train and test features into one frame so both get identical text cleaning;
# the target and the id columns are kept aside for later.
combined_data,train_target,train_id,test_id = combine_df(df_comments_train,df_comments_test)
combined_data

Unnamed: 0,keyword,location,text
0,,,Our Deeds are the Reason of this #earthquake M...
1,,,Forest fire near La Ronge Sask. Canada
2,,,All residents asked to 'shelter in place' are ...
3,,,"13,000 people receive #wildfires evacuation or..."
4,,,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...
10871,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10872,,,Storm in RI worse than last hurricane. My city...
10873,,,Green Line derailment in Chicago http://t.co/U...
10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [39]:
# Confirm that 'id' and 'target' are no longer part of the combined frame.
combined_data.columns

Index(['keyword', 'location', 'text'], dtype='object')

In [40]:
# Single shared lemmatizer instance, used by the text-cleaning step below.
lemmatizer = WordNetLemmatizer()


In [50]:
# Build the stop-word set once. The original rebuilt it from the NLTK corpus
# for every single token, which dominated the runtime of this cell.
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Return `text` reduced to lower-case, lemmatized, non-stop-word tokens.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lower-cased and split on whitespace, English stop words are dropped, each
    remaining token is lemmatized, and the tokens are joined with single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Overwrite the 'text' column with its cleaned form. Mapping over the column
# avoids mutating the frame while iterating over it with iterrows().
combined_data['text'] = combined_data['text'].map(clean_text)


In [51]:
# Write the cleaned combined frame to disk (presumably a debugging snapshot; nothing
# below reads 'yo.csv'). The row index is written as the first column by default.
combined_data.to_csv('yo.csv')

In [53]:
# Undo the earlier stacking: leading rows are the training tweets, the rest are test tweets.
df_train,df_test = split_df(combined_data)
df_train

Unnamed: 0,keyword,location,text
0,,,deed reason earthquake may allah forgive u
1,,,forest fire near la ronge sask canada
2,,,resident asked shelter place notified officer ...
3,,,people receive wildfire evacuation order calif...
4,,,got sent photo ruby alaska smoke wildfire pour...
...,...,...,...
7608,,,two giant crane holding bridge collapse nearby...
7609,,,aria ahrary thetawniest control wild fire cali...
7610,,,utc km volcano hawaii http co zdtoyd ebj
7611,,,police investigating e bike collided car littl...


In [52]:
# Display the cleaned test rows; they keep their positions from the combined frame as index labels.
df_test

Unnamed: 0,keyword,location,text
7613,,,happened terrible car crash
7614,,,heard earthquake different city stay safe ever...
7615,,,forest fire spot pond goose fleeing across str...
7616,,,apocalypse lighting spokane wildfire
7617,,,typhoon soudelor kill china taiwan
...,...,...,...
10871,,,earthquake safety los angeles safety fastener ...
10872,,,storm ri worse last hurricane city amp others ...
10873,,,green line derailment chicago http co utbxlcbiuy
10874,,,meg issue hazardous weather outlook hwo http c...


In [65]:
# Vectorise first, then split. The split used to live in a cell placed above the
# cell that defines X and y, so a fresh-kernel "Run All" raised NameError.
X_hasChar = df_train.text.tolist()
y = train_target.tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_hasChar).toarray()

# NOTE: the TF-IDF weights are fitted on every training tweet before the hold-out
# split, so the hold-out accuracy reported below is slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
X.shape




(7613, 20661)

In [74]:
# Despite its name, X_test_hasChar holds the cleaned text of the whole combined
# frame (train rows followed by test rows), not only the test tweets.
X_test_hasChar = combined_data.text.tolist()
# fit_transform re-fits the same `vectorizer` object, so from here on its vocabulary
# comes from train + test text and no longer matches the columns of `X`.
X_final = vectorizer.fit_transform(X_test_hasChar).toarray()
type(X_final)

numpy.ndarray

In [75]:
# The combined matrix has the training rows first; derive the boundary from the
# number of training labels instead of hard-coding the row count.
n_train_rows = len(train_target)
X_final_train = X_final[:n_train_rows]
X_final_test = X_final[n_train_rows:]

In [68]:
# Hold-out check: train Naive Bayes on the training part of the split
# and predict the held-out part.
fake_twitter_model = MultinomialNB()
fake_twitter_model.fit(X_train, y_train)
y_pred = fake_twitter_model.predict(X_test)

In [70]:
# Fraction of held-out tweets whose predicted label equals the true label.
accuracy_score(y_test,y_pred)

0.8074015121368882

In [76]:
# Final model: refit on every labelled tweet, then predict the competition test rows.
# Both names are rebound here, replacing the hold-out model and its predictions.
fake_twitter_model = MultinomialNB()
fake_twitter_model.fit(X_final_train, y)
y_pred = fake_twitter_model.predict(X_final_test)


In [77]:
# Sanity check: there should be one prediction per test row.
y_pred.shape

(3263,)

In [80]:
# test_id was returned by combine_df as a copy of the test frame's 'id' column.
type(test_id)

pandas.core.series.Series

In [79]:
# Wrap the predicted labels in a one-column frame named 'target'.
df_pred = pd.DataFrame({'target': y_pred})
df_pred

Unnamed: 0,target
0,1
1,0
2,1
3,1
4,1
...,...
3258,1
3259,0
3260,1
3261,1


In [81]:
# Turn the saved test ids into a one-column frame that will become the submission.
df_id = test_id.to_frame()
df_id

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11
...,...
3258,10861
3259,10865
3260,10868
3261,10874


In [83]:
# Attach the predictions by position: passing a plain NumPy array means the
# values are not aligned on index labels.
predicted_labels = df_pred['target'].to_numpy()
df_id = df_id.assign(target=predicted_labels)
df_id

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,0
3260,10868,1
3261,10874,1


In [84]:
# index=False: by default to_csv also writes the row index as an unnamed first
# column. The submission is expected to contain only `id` and `target`
# (assumed to match sample_submission.csv; confirm against that file).
df_id.to_csv('submission.csv', index=False)