In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the input
# directory, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training and test sets, using the `id` column as the index.
tweets = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv', index_col='id')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv', index_col='id')

# Checking out the training data.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
tweets.info()

# There are some missing keyword values which we should fill so we can use the column.
# The filled column is assigned back to the frame instead of calling
# `fillna(inplace=True)` on `tweets.keyword`: that form mutates an intermediate
# Series and, under pandas Copy-on-Write, leaves the parent DataFrame unchanged.
tweets['keyword'] = tweets['keyword'].fillna('')
test_df['keyword'] = test_df['keyword'].fillna('')
tweets.info()

In [None]:
# Some keywords contain the literal string "%20" where a space is meant.
# List the affected keywords (and how often each occurs) before cleaning.
rows_with_percent = tweets[tweets.keyword.str.contains('%')]
print(rows_with_percent.keyword.value_counts())

# Swap every "%20" for a plain space, in the training and the test data alike.
for frame in (tweets, test_df):
    frame['keyword'] = frame.keyword.str.replace("%20", " ")

# Sanity check: should show an empty dataframe now.
print('\n', tweets[tweets.keyword.str.contains('%')])


In [None]:
# Taking a look at the location column
print(tweets.location.value_counts())

# Some of these entries could hold important information, so the missing values
# are filled with an empty string rather than dropping the column.
# The result is assigned back to the frame: calling `fillna(inplace=True)` on
# `tweets.location` acts on an intermediate Series and does not update the
# DataFrame under pandas Copy-on-Write.
tweets['location'] = tweets['location'].fillna('')
test_df['location'] = test_df['location'].fillna('')

# info() prints its own report and returns None, so it is not wrapped in print().
tweets.info()

# Now that every feature column holds text information they can be concatenated together into one column

In [None]:
def _merge_text_columns(frame):
    """Return a new frame with `text` rebuilt as "<keyword> <text> <location>".

    The `keyword` and `location` columns are dropped from the result, so the
    keyword ends up emphasized at the start of the text and the location is
    included at the end.
    """
    merged = frame.keyword + " " + frame.text + " " + frame.location
    return frame.assign(text=merged).drop(columns=['keyword', 'location'])


# The target column is kept separately from the features.
y = tweets.target

# How the labels are split between real and fake disasters.
print(y.value_counts())

# Features: everything except the target, collapsed into one text column.
X = tweets.drop(columns='target')
print(X.columns, X.shape)
X = _merge_text_columns(X)

# The test data goes through exactly the same merge.
test_df = _merge_text_columns(test_df)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count features over single words and word pairs. The vectorizer is fitted on
# the training text only and then reused, unchanged, on the test text.
vect = CountVectorizer(ngram_range=(1, 2))
train_text, test_text = X['text'], test_df['text']
X_vect = vect.fit_transform(train_text)
test_vect = vect.transform(test_text)
# Note: I tried including a polynomial feature interaction here to improve the
# model but my computer could not calculate it, even with a sparse matrix.

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression on the count features, capped at 200
# solver iterations. fit() returns the estimator itself, so it can be chained.
logreg = LogisticRegression(max_iter=200).fit(X_vect, y)
logreg

In [None]:
from sklearn.model_selection import cross_val_score

# One score per fold for the baseline logistic regression (4 folds).
logreg_fold_scores = cross_val_score(estimator=logreg, X=X_vect, y=y, cv=4)
logreg_fold_scores

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree comparison model.
# random_state is fixed because tree fitting involves randomness; without a
# seed the fold scores below change from one run of the notebook to the next.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_vect, y)

# One score per fold (4 folds), for comparison with the other models.
cross_val_score(tree, X_vect, y, cv=4)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour comparison model using the 7 closest training tweets.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_vect, y)

# One score per fold (4 folds), for comparison with the other models.
knn_fold_scores = cross_val_score(knn, X_vect, y, cv=4)
knn_fold_scores

In [None]:
# A logistic regression model seems to be the most effective here
# Performing a randomized search for good hyperparameters
from sklearn.model_selection import RandomizedSearchCV

# Candidate settings: five evenly spaced values of C between 0.1 and 2,
# each with and without an intercept term.
param_grid = {'C': np.linspace(0.1, 2, 5),
              'fit_intercept': [True, False]}

# random_state pins the sampling of candidate settings so that re-running the
# notebook repeats the same search and reports the same best parameters.
logreg_cv = RandomizedSearchCV(logreg, param_grid, cv=4, random_state=0)
logreg_cv.fit(X_vect, y)
logreg_cv.best_params_

In [None]:
# Build the final model from the hyperparameters the search actually selected
# (`C` and `fit_intercept`) instead of hard-coding them, so this cell cannot
# drift out of sync with the search result above.
logreg_best = LogisticRegression(max_iter=200, penalty='l2', solver='lbfgs',
                                 **logreg_cv.best_params_)
logreg_best.fit(X_vect, y)

# Mean of the 4 fold scores for the tuned model.
np.mean(cross_val_score(logreg_best, X_vect, y, cv=4))

In [None]:
from IPython.display import FileLink

# Predict a label for every test tweet with the tuned model.
preds = logreg_best.predict(test_vect)

# The submission keeps the test ids as its index and holds a single `target` column.
submission = pd.DataFrame({'target': preds}, index=test_df.index)

# Write the CSV into the working directory and show a link to it.
os.chdir(r'/kaggle/working')
submission.to_csv(r'DisasterTweets.csv')
FileLink(r'DisasterTweets.csv')