## NLP Disaster Tweets Classification

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load the competition's training and test splits from the Kaggle input folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)
# Report both (rows, columns) shapes on a single line.
print(train_df.shape, test_df.shape, sep='   ')

### A quick look at our data

In [None]:
# Preview the first 30 training rows.
train_df.head(n=30)

#### Tweets keywords ranking:

In [None]:
# Horizontal bar chart of how often each keyword occurs in the training set.
train_df['keyword'].value_counts().plot(kind='barh', figsize=(15, 40), color='green')

#### Proportion of disaster tweets (target = 1):

In [None]:
# Bar chart of the number of training tweets per target value.
ax = train_df['target'].value_counts().plot(kind='bar')
ax.set_ylabel('Counts', size=12)
ax.set_xlabel('Target', size=12)

#### Show some special content in the text, for example links:

In [None]:
# Select the tweets whose text contains a link.
# `str.contains` is used instead of `str.find("http") > 0`, because `find`
# returns 0 for a tweet that *starts* with the link and such rows would be
# dropped by a `> 0` test. `regex=False` matches the literal substring and
# `na=False` treats missing text as "no link".
train_df.loc[train_df["text"].str.contains("http", regex=False, na=False), "text"]

### Remove extra spaces, special characters, emojis, and links from text:

In [None]:
import re

icount = 0
def clean_text(text):
    """Normalize a tweet for bag-of-words vectorization.

    Steps, in order: lowercase the text; delete every run of non-whitespace
    characters that starts with ``http`` (links); replace each character that
    is not a lowercase Cyrillic/Latin letter or a digit with a space; collapse
    whitespace runs into a single space; strip leading/trailing spaces.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. a float NaN for a missing
        tweet) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, without repeated, leading or trailing spaces.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on `.lower()`.
        return ""
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r"[^а-яёйa-z0-9]", " ", text)
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    text = re.sub(r"\s+", " ", text)
    # Removing a link or punctuation at either end leaves a stray space there.
    return text.strip()


# Run `clean_text` over the tweet text of both splits, overwriting the column.
train_df["text"] = train_df["text"].map(clean_text)
test_df["text"] = test_df["text"].map(clean_text)

### Prepare test, train dfs

In [None]:
# Features are the tweet text; labels are the `target` column.
X_train, Y_train = train_df["text"], train_df["target"]
# Only the text column is taken from the test split.
X_test = test_df["text"]

### Modelling with Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weighting -> Multinomial Naive Bayes classifier.
text_clf_mnb = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

#### Cross Validation, Multinomial Naive Bayes

In [None]:
# 3-fold cross-validation of the Naive Bayes pipeline, scored on F1 and accuracy.
scores = model_selection.cross_validate(
    estimator=text_clf_mnb,
    X=X_train,
    y=Y_train,
    scoring=('f1', 'accuracy'),
    cv=3,
    return_train_score=True,
)
# Only the held-out (test-fold) scores are reported.
print("Multinomial Naive Bayes:")
print("F1 scores: ", scores['test_f1'])
print("Accuracy scores: ", scores['test_accuracy'])

Moderately effective. Now try logistic regression:

### Modelling with LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Token counts -> TF-IDF weighting -> logistic regression with default settings.
text_clf_lg = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression()),
    ]
)

#### Cross Validation, Logistic Regression

In [None]:
# 3-fold cross-validation of the logistic-regression pipeline (F1 and accuracy).
scores = model_selection.cross_validate(
    estimator=text_clf_lg,
    X=X_train,
    y=Y_train,
    scoring=('f1', 'accuracy'),
    cv=3,
    return_train_score=True,
)
# Only the held-out (test-fold) scores are reported.
print("Logistic Regression:")
print("F1 scores: ", scores['test_f1'])
print("Accuracy scores: ", scores['test_accuracy'])

Similar performance. Now try a linear SVM, trained with stochastic gradient descent:

### Modelling with SGD Classifier

In [None]:
# Token counts -> TF-IDF weighting -> linear model trained with SGD.
# SGD shuffles the training data on each epoch, so a fixed `random_state` is
# needed for the cross-validation and grid-search scores to be reproducible
# across runs.
text_clf_sgd = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(random_state=42))])


#### Cross Validation, SGD Classifier

In [None]:
# 3-fold cross-validation of the SGD pipeline, scored on F1 and accuracy.
scores = model_selection.cross_validate(
    estimator=text_clf_sgd,
    X=X_train,
    y=Y_train,
    scoring=('f1', 'accuracy'),
    cv=3,
    return_train_score=True,
)
# Only the held-out (test-fold) scores are reported.
print("SGD Classifier:")
print("F1 scores: ", scores['test_f1'])
print("Accuracy scores: ", scores['test_accuracy'])

### Hyperparameter optimization: looking for improvements with GridSearchCV. Tried f1, accuracy, and roc_auc as the figure of merit; kept roc_auc as a compromise

In [None]:
# Tried scoring = "accuracy", "f1", "roc_auc"
from sklearn.model_selection import GridSearchCV

# Search space over the vectorizer, TF-IDF and Naive Bayes steps of the pipeline.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(10, 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
    clf__fit_prior=(True, False),
)

# 5-fold grid search ranked by ROC AUC; n_jobs=-1 runs the fits in parallel.
gs_clf = GridSearchCV(
    estimator=text_clf_mnb,
    param_grid=parameters,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
gs_clf.fit(X_train, Y_train)

print("GridSearch results for MultiNomial Naive Bayes: ")
print('best score: ', gs_clf.best_score_)
print('best parameters: ', gs_clf.best_params_)
    

In [None]:
# Learning-rate schedules to search; only 'optimal' is included.
learning_rate = ['optimal']
# NOTE(review): eta0 is computed here but is not referenced by the grid below.
eta0 = np.logspace(start=0, stop=4, num=10)

# Search space over the vectorizer, TF-IDF and SGD steps of the pipeline.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(10, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7),
    clf__learning_rate=learning_rate,
)

# 5-fold grid search ranked by ROC AUC; n_jobs=-1 runs the fits in parallel.
gs_clf = GridSearchCV(
    estimator=text_clf_sgd,
    param_grid=parameters,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
gs_clf.fit(X_train, Y_train)

print("GridSearch results for SGD Classifier: ")
print('best score: ', gs_clf.best_score_)
print('best parameters: ', gs_clf.best_params_)

GridSearchCV for LogisticRegression:

In [None]:
# 50 log-spaced candidate values for C, from 1e-4 to 1e4.
C = np.logspace(start=-4, stop=4, num=50)
# Searched combination: L2 penalty with the 'lbfgs' and 'newton-cg' solvers.
# Alternative not searched here: penalty ['l1', 'l2'] with solver 'liblinear'.
penalty = ['l2']
solver = ['lbfgs', 'newton-cg']
# NOTE(review): penalty_and_solver is defined but not used by the grid below.
penalty_and_solver = [(['l1', 'l2'],'liblinear'),('l2','lbfgs'),('l2','newton-cg')]

# Search space over the vectorizer, TF-IDF and logistic-regression steps.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__C': C,
    'clf__penalty': penalty,
    'clf__solver': solver,
}

# 5-fold grid search ranked by ROC AUC; n_jobs=-1 runs the fits in parallel.
gs_clf = GridSearchCV(
    estimator=text_clf_lg,
    param_grid=parameters,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
gs_clf.fit(X_train, Y_train)

print("GridSearch results for Logistic Regression: ")
print('best score: ', gs_clf.best_score_)
print('best parameters: ', gs_clf.best_params_)

### Logistic Regression with its optimal parameters performs slightly better in terms of best roc_auc. Fit the final model with those parameters:

In [None]:
# Final model: unigram + bigram counts, TF-IDF with IDF weighting, and an
# L2-penalized logistic regression, fitted on the full training set.
# NOTE(review): C appears to be copied from a grid-search run, and 'liblinear'
# is not among the solvers searched in this notebook -- confirm these values.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', LogisticRegression(C=0.8286, penalty='l2', solver='liblinear')),
    ]
)
text_clf.fit(X_train, Y_train)

In [None]:
# Predict a label for every test tweet with the fitted pipeline.
predictions = text_clf.predict(X_test)
# Show the first ten predicted labels.
print(predictions[0:10])

In [None]:
# Load the competition's sample submission file.
sample_submission = pd.read_csv(
    filepath_or_buffer="/kaggle/input/nlp-getting-started/sample_submission.csv"
)

In [None]:
# Set the `target` column of the sample submission to the model's predictions.
sample_submission = sample_submission.assign(target=predictions)

In [None]:
# Preview the first 25 rows of the submission frame.
sample_submission.head(n=25)

In [None]:
# Write the submission file to the working directory, without the index column.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)

Now, in the viewer, you can submit the above file to the competition! Good luck!