In [22]:
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer as tf
from sklearn import svm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
# Load the labelled tweets and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="tweet_train.csv")
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Summarise column dtypes and non-null counts of the raw training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
# Keep only the tweet text and its label, indexed by tweet id.
# This labelled training data is split into train / hold-out (cv) sets further down.
df = df[["id", "text", "target"]].set_index("id")

In [7]:
# Inspect the frame after reducing it to text/target and indexing it by id.
df

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1
5,All residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
10869,Two giant cranes holding a bridge collapse int...,1
10870,@aria_ahrary @TheTawniest The out of control w...,1
10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,Police investigating after an e-bike collided ...,1


In [8]:
# Separate the label from the features without mutating `df`, so this cell
# can be re-run safely (the previous `df.pop("target")` removed the column in
# place and raised KeyError on a second execution).
target = df["target"]
attributes = df.drop(columns=["target"])

# Fix the seed so the train / hold-out split -- and therefore every score
# reported below -- is reproducible across kernel restarts.
X_train, X_cv, y_train, y_cv = train_test_split(
    attributes, target, random_state=42
)

In [9]:
# Preview the training portion of the split (a single `text` column indexed by id).
X_train

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
10457,@SenFeinstein Thanks Sen. Feinstein now hurr...
1603,Shadowflame and the Wraith: Bombed http://t.co...
4812,FAAN orders evacuation of abandoned aircraft a...
4948,My head exploded i swear
7248,Alarming Rise in Dead Marine Life Since the #F...
...,...
2190,The Catastrophic Effects of Hiroshima and Naga...
1927,@Louis_Tomlinson incredible? THE CHILDREN WERE...
7086,@LeMaireLee @danharmon People Near Meltdown Co...
7723,Yet Brits are panicking about the UK http://t....


In [10]:
# Fit the TF-IDF vectorizer on the training tweets only, then reuse the
# fitted vectorizer to transform the hold-out tweets.
vectorizer = tf()
train_tweets = X_train["text"]
cv_tweets = X_cv["text"]
X_train_vec = vectorizer.fit_transform(train_tweets)
X_cv_vec = vectorizer.transform(cv_tweets)

In [11]:
# Show the shape and storage summary of the vectorized training matrix.
X_train_vec

<5709x17632 sparse matrix of type '<class 'numpy.float64'>'
	with 83470 stored elements in Compressed Sparse Row format>

In [12]:
# Same preview as the earlier X_train cell; the frame still holds the raw tweet text.
X_train

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
10457,@SenFeinstein Thanks Sen. Feinstein now hurr...
1603,Shadowflame and the Wraith: Bombed http://t.co...
4812,FAAN orders evacuation of abandoned aircraft a...
4948,My head exploded i swear
7248,Alarming Rise in Dead Marine Life Since the #F...
...,...
2190,The Catastrophic Effects of Hiroshima and Naga...
1927,@Louis_Tomlinson incredible? THE CHILDREN WERE...
7086,@LeMaireLee @danharmon People Near Meltdown Co...
7723,Yet Brits are panicking about the UK http://t....


In [25]:
# Base estimator handed to the grid search.
linear_svm = svm.SVC(kernel="linear")

# Two sub-grids: 4 linear candidates (C only) and 4 x 9 = 36 rbf candidates
# (C crossed with gamma).
parameters = [
    {
        "C": [1, 10, 100, 1000],
        "kernel": ["linear"],
    },
    {
        "C": [1, 10, 100, 1000],
        "kernel": ["rbf"],
        "gamma": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    },
]

# Search every candidate on the vectorized training tweets.
grid_search = GridSearchCV(linear_svm, parameters)
grid_search.fit(X_train_vec, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000],
                          'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                    0.9],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [26]:
# Report the winning hyper-parameters and their cross-validated score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

{'C': 1, 'kernel': 'linear'}
0.8038174336273677


In [74]:
# GridSearchCV fits clones of the estimator it is given, so `linear_svm`
# itself is still unfitted after the search and scoring it on a fresh kernel
# raises NotFittedError. Fit it on the training split first, as the rbf and
# sigmoid cells do, then report hold-out accuracy.
linear_svm.fit(X_train_vec, y_train)
linear_svm.score(X_cv_vec, y_cv)

0.8093487394957983

In [75]:
# Train an RBF-kernel SVM (other settings left at library defaults) on the training split.
rbf_svm = svm.SVC(kernel="rbf")
rbf_svm.fit(X=X_train_vec, y=y_train)

SVC()

In [76]:
# Hold-out accuracy of the RBF-kernel SVM.
rbf_svm.score(X=X_cv_vec, y=y_cv)

0.8135504201680672

In [77]:
# Train a sigmoid-kernel SVM (other settings left at library defaults) on the training split.
sig_svm = svm.SVC(kernel="sigmoid")
sig_svm.fit(X=X_train_vec, y=y_train)

SVC(kernel='sigmoid')

In [78]:
# Hold-out accuracy of the sigmoid-kernel SVM.
sig_svm.score(X=X_cv_vec, y=y_cv)

0.8098739495798319

In [27]:
# Load the unlabelled tweets to predict on and preview the first five rows.
dftest = pd.read_csv(filepath_or_buffer="tweet_test.csv")
dftest.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [28]:
# Vectorize the test tweets with the already-fitted vectorizer, predict with
# the grid-search model, and pair each prediction with its tweet id.
test_predictions = grid_search.predict(vectorizer.transform(dftest["text"]))
dfsub = pd.DataFrame({"id": dftest["id"], "target": test_predictions})
dfsub

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [29]:
# Write the id/target pairs to disk without the positional index column.
dfsub.to_csv(path_or_buf="tweetsub.csv", index=False)