In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
warnings.filterwarnings("ignore")

data = pd.read_csv("tweets_train.csv")
test = pd.read_csv("tweets_test.csv")

data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [32]:
data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [33]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [34]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [35]:
data.shape

(7613, 5)

In [36]:
from nltk.stem.porter import *
symbols = "!/$%&()*+-.#/:;<=>?@[\]^_'{|}~\n"

tokens = [['this','is','an','example',"'s"],['the','dog','ate',"don't",'do']]

stemmer = PorterStemmer()
word = stemmer.stem("notified")

print(word)

notifi


In [37]:
#make all words lowercase and remove all stop words and punctuation

from nltk.stem.porter import *
stemmer = PorterStemmer()

stop_words = set(stopwords.words('english'))
symbols = "!/$%&()*+-.#/:;<=>?@[\]^_'{|}~\n"

for num in range(7613):
    tokens = word_tokenize(data['text'][num])
    word_tokens = np.char.lower(tokens)
    for i in symbols:
        word_tokens = np.char.replace(word_tokens,i," ")
    
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    new_tweet = ""
    for i in filtered_sentence:
        new_tweet += i
        new_tweet += " "
    data['text'][num] = new_tweet

data['text'].head()

0      deeds reason   earthquake may allah forgive us 
1             forest fire near la ronge sask   canada 
2    residents asked  shelter place   notified offi...
3    13,000 people receive   wildfires evacuation o...
4    got sent photo ruby   alaska smoke   wildfires...
Name: text, dtype: object

In [38]:
test.head(20)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [39]:
def split(data, y, length, split_mark = 0.7):
    if split_mark > 0 and split_mark < 1.0:
        n = int(split_mark * length)
    else:
        n = int(split_mark)
    X_train = data[:n].copy()
    X_test = data[n:].copy()
    y_train = y[:n].copy()
    y_test = y[n:].copy()
    return X_train, X_test, y_train, y_test

In [40]:
vectorizer = CountVectorizer(ngram_range = (1,3))

In [41]:
X_train, X_test, y_train, y_test = split(data.text, data.target, len(data))
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(5329,) (2284,) (5329,) (2284,)


In [42]:
print("Samples per class: {}".format(np.bincount(y_train)))
print("Samples per class: {}".format(np.bincount(y_test)))

Samples per class: [3067 2262]
Samples per class: [1275 1009]


In [43]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [44]:
feature_names = vectorizer.get_feature_names()
print("Number of features:{}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 19500 to 19530:\n{}".format(feature_names[19500:19530]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))


Number of features:99066
First 20 features:
['00', '00 11', '00 11 16', '00 11 utc', '00 18', '00 18 00', '00 52', '00 52 25', '00 at', '00 at http', '00 bestseller', '00 bestseller http', '00 ep', '00 ep http', '00 hiroshima', '00 hiroshima http', '00 http', '00 http co', '00 nnw', '00 nnw hana']
Features 19500 to 19530:
['co drbcrypj4p', 'co drculiyp0t', 'co drf3mmrbyx', 'co drfkarlz1d', 'co drlkebeypi', 'co drno7okm21', 'co ds76lozlsu', 'co dsb3ldfuxu', 'co dspws6hj8w', 'co dthneezupe', 'co dthneezupe pic', 'co dunmvj7itl', 'co dunmvj7itl course', 'co dutyzqr2p7åê', 'co dutyzqr2p7åê avoid', 'co duvuzhmvut', 'co dv4mmlso1i', 'co dvonwiv3l1', 'co dwbc1duvdk', 'co dxfqou4kt2', 'co dxirntdsrd', 'co dxkt2shuj2', 'co dxvtgi1bvo', 'co dxwfx56pwh', 'co dy1ersdcrh', 'co dy1ersdcrh http', 'co dydfvz7amj', 'co dydfvz7amj via', 'co dywwnbbyvj', 'co dywwnbbyvj days']
Every 2000th feature:
['00', '3f7owdecy7', 'achievement unlocked replaced', 'amp cliff', 'ass thought', 'battle hits', 'blog appre

In [45]:
vocab = vectorizer.vocabulary_
dict(list(vocab.items())[::4000])

{'deeds': 27417,
 'shall annihilated petebests': 80294,
 'co kxplyom9rr govegan': 20528,
 'thelonevirologi mackayim major': 87277,
 'love cotton': 58353,
 'katt katterpì instagram': 53477,
 'co 6hkw5qlppt': 18770,
 'casualty marketwatch http': 16066,
 'portable': 71475,
 'bestfriends high': 10836,
 'attractive man': 8383,
 'demolish food': 27799,
 'rape kill destroy': 73947,
 'mountaineering': 62885,
 'twice suffice time': 90586,
 'runner joy': 77597,
 'wildfire http co': 95608,
 'she tasted like': 80417,
 'saying hi fire': 78601,
 'dying brain floods': 31235,
 '530': 2470,
 'libya remain': 56264,
 'inundated articles': 51481,
 'massacre sinjar world': 60057,
 'grew 2013': 42097}

In [46]:
i = 10000
j = 100
words = vectorizer.get_feature_names()[i:i+10]
pd.DataFrame(X_train[j:j+7,i:i+10].todense(), columns=words)

Unnamed: 0,battle hits,battle hits rbi,battle internal,battle internal vs,battle occurred,battle occurred star,battle rapper,battle ripped,battle ripped hole,battle season
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


In [47]:
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.66


In [48]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

Training set score: 0.994
Test set score: 0.734


In [49]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [50]:
predictions_logreg = []
for i in test["text"]:
    pred1 = logreg.predict(vectorizer.transform([i])[0])
    predictions_logreg+=[pred1[0]]
print(predictions_logreg[:50])

[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]


In [51]:
final_df = pd.DataFrame(index = None)
final_df["id"]=test["id"]
final_df["target"]=predictions_logreg

print(len(predictions_logreg))

3263


In [52]:
submission = final_df.to_csv('predictions', index=None)

In [53]:
param_grid = {"C":[.001,.01,.1,1,10]}
grid = GridSearchCV(estimator = LogisticRegression(), param_grid = param_grid, cv=10)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ",grid.best_params_)

Best cross-validation score: 0.64
Best parameters:  {'C': 0.1}
