In [1]:
# e.g. if using google colab import drive, uncomment lines below
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# import packages

import os
import sklearn
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression as sk_OLS
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
import math

# Part (a): Download the data

In [3]:
#====================================================#
# YOUR CODE HERE:
#   Import train and test csv files.
#   You should use the pd.read_csv function.
#   You should set the index_col parameter to equal 'id'.
#====================================================#

train_data = pd.read_csv('train.csv', index_col='id')
test_data  = pd.read_csv('test.csv', index_col='id')

# Count the number of data points
num_train_data_points = len(train_data)
num_test_data_points = len(test_data)

print(f"Number of training data points: {num_train_data_points}")
print(f"Number of test data points: {num_test_data_points}")

# Calculate the percentage of tweets that are about real disasters
percentage_real_disasters = (train_data['target'].sum() / num_train_data_points) * 100
percentage_not_real_disasters = 100 - percentage_real_disasters

print(f"Percentage of tweets about real disasters: {percentage_real_disasters:.2f}%")
print(f"Percentage of tweets not about real disasters: {percentage_not_real_disasters:.2f}%")

#====================================================#
# END YOUR CODE
#====================================================#


Number of training data points: 7613
Number of test data points: 3263
Percentage of tweets about real disasters: 42.97%
Percentage of tweets not about real disasters: 57.03%


In [4]:
#====================================================#
# YOUR CODE HERE:
#   Get the index values for X_train and y_train.
#   Get the data values for X_train and y_train.
#   Get the index values for X_test.
#   Get the index values for y_test.
#====================================================#

# get train indices
X_train_id = train_data.index
y_train_id = train_data['target']
# get train data
X_train    = train_data.drop(columns=['target'])  # Remove the 'target' column
y_train    = train_data['target']

# get test indices
X_test_id  = test_data.index
# get test data
X_test     = test_data

#====================================================#
# END YOUR CODE
#====================================================#

print(f"Train Data Shape: {X_train.shape}")
print(f"Test Data Shape: {X_test.shape}")

print(f"Number of labels = 1 in train dataset as percentage: {((y_train == 1).sum() / (X_train.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 0 in train dataset as percentage: {((y_train == 0).sum() / (X_train.shape[0])) * 100:0.2f}%")

Train Data Shape: (7613, 3)
Test Data Shape: (3263, 3)
Number of labels = 1 in train dataset as percentage: 42.97%
Number of labels = 0 in train dataset as percentage: 57.03%


### Part (a), Question 1: How many training and test data points are there?

### Answer: There are 7613 data points for training and 3263 data points for test.

### Part (a), Question 2: what percentage of the training tweets are of real disasters, and what percentage is not?

### Answer: The percentage for real disasters is 42.97%, and the percentage for not-real disasters is 57.03%

# Part (b): Split the training data.

In [5]:
#====================================================#
# YOUR CODE HERE:
#  You should use the sklearn.model_selection.train_test_split
#     parameter to perform the train/development split
#   Set the test_size to 0.30.
#   Set the random_stat parameter to 42.
#====================================================#

X_train_orig   = train_test_split(
    X_train, y_train, test_size=0.30, random_state=42)[0]
X_develop_orig = train_test_split(
    X_train, y_train, test_size=0.30, random_state=42)[1]
y_train_orig   = train_test_split(
    X_train, y_train, test_size=0.30, random_state=42)[2]
y_develop_orig = train_test_split(
    X_train, y_train, test_size=0.30, random_state=42)[3]

#====================================================#
# END YOUR CODE
#====================================================#

# Part (c): Preprocess the data.

In [6]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function to obtain the pre-processed
#  X_train and X_develop
#  Note that we suggest you to do every sub-question in a dedicated Python
#  function to make the code more structured and less error-prone.
#  With a function, you can clearly test each part when you encounter an error.
#  You can also create your own simple input data (e.g. just one sample) to
#  test the correctness of a function.
#========================================================================#

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

def pre_process(text):
    # Convert text to lowercase
    pre_processed = text.lower()
    
    # Lemmatize words
    lemmatize = WordNetLemmatizer()
    pre_processed = " ".join([lemmatize.lemmatize(token) for token in pre_processed.split(" ")])
    
    # Remove punctuation and @
    pre_processed = " ".join([word for word in pre_processed.split() if word.isalpha()])
    
    # Remove URLs
    pre_processed = re.sub(r"http\S+|www\S+|https\S+", '', pre_processed, flags=re.MULTILINE)
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    pre_processed = " ".join([word for word in pre_processed.split() if not word in stop_words])
    
    #========================================================================#
    #  This function should return the pre-processed data
    #========================================================================#
    return pre_processed # Feel free to change the variable name

# Apply the pre_process_text function to the text columns of DataFrames
X_train_preproc = X_train_orig.copy()
X_train_preproc['text'] = X_train_orig['text'].apply(pre_process)

X_develop_preproc = X_develop_orig.copy()
X_develop_preproc['text'] = X_develop_orig['text'].apply(pre_process)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrewpark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1. I first converted the texts into lower case. This can make the data treatment simple afterwards.

2. Then I lemmatized all the lowercase words.

3. Removing punctuation and other symbols (e.g. @) by using '.isalpha()' method.

4. Lastly, I removed URLs and stopwords to simplify the data.

# Part (d): Bag of words model.

In [7]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function to obtain X_train and X_develop,
#  whose "text" feature only contains 1 and 0 to indicate whether a word is in
#  the tweet. At this point, you should only be constructing feature vectors
#  for each data point using the text in the “text” column.
#  You should ignore the “keyword” and “location” columns for now.
#========================================================================#
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = None

def bag_of_word(data, min_df=1):
    global vectorizer
    # Initialize the CountVectorizer with binary=True
    
    M = 4
    
    if vectorizer is None:
        vectorizer = CountVectorizer(binary=True, min_df=M)
        # Fit the vectorizer on the training data and transform both training and development data
        featurized_data = vectorizer.fit_transform(data["text"]).toarray()
    else:
        featurized_data = vectorizer.transform(data['text'])

    #========================================================================#
    #  This function should return the new data whose "text" feature contains
    #  only 0 and 1
    #========================================================================#

    return featurized_data # Feel free to change the variable name

# get the featurized data
X_train   = bag_of_word(X_train_preproc)
X_develop = bag_of_word(X_develop_preproc)



Set my M to 4 so that the words have more weights. Normally, setting M to 5 is preferable, but since I wanted a high performance, I lowered the M to 4.

# Part (e): Logistic regression.

In [8]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  without regularization terms.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#

from sklearn.linear_model import LogisticRegression

def logistic_without_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    reg = LogisticRegression(penalty='none', max_iter=1000)

    # then fit your model to the train data
    reg.fit(X_train, Y_train)

    # then generate your prediction for the training set
    y_train_no_reg = reg.predict(X_train)

    # then generate your prediction for the development set
    y_develop_no_reg = reg.predict(X_develop)
    #========================================================================#
    #  This function should train a logistic regression model without
    #  regularization terms.
    #  Report the F1 score in your training and in your development sets.
    #========================================================================#
    return y_train_no_reg, y_develop_no_reg

y_train_no_reg, y_develop_no_reg = logistic_without_regularization(X_train, y_train_orig, X_develop, y_develop_orig)

# get the F1 train and develop scores
F1_train_no_reg = sklearn.metrics.f1_score(y_train_orig, y_train_no_reg)
F1_develop_no_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_no_reg)

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_no_reg:.2f}")
print(f"F1 for development set: {F1_develop_no_reg:.2f}")



F1 for training set: 0.93
F1 for development set: 0.65


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


The significant gap between the training set's high F1 score and the lower F1 score of the development set indicates overfitting, where the model has memorized the training data but struggles to generalize. To mitigate overfitting, consider gathering more diverse data, applying regularization techniques like L1 or L2, and fine-tuning hyperparameters through cross-validation. Additionally, simplifying the model architecture can make it less prone to overfitting, leading to improved generalization performance.

In [9]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  with L1 regularization.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#
def logistic_L1_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    reg = LogisticRegression(penalty = 'l1', solver='liblinear')

    # then fit your model to the train data
    reg.fit(X_train, Y_train)

    # then generate your prediction for the training set
    y_train_L1_reg = reg.predict(X_train)

    # then generate your prediction for the development set
    y_develop_L1_reg = reg.predict(X_develop)
    #========================================================================#
    #  This function should train a logistic regression model without
    #  regularization terms.
    #  Report the F1 score in your training and in your development sets.
    #========================================================================#
    return y_train_L1_reg, y_develop_L1_reg

y_train_L1_reg, y_develop_L1_reg = logistic_L1_regularization(X_train, y_train_orig, X_develop, y_develop_orig)

# get the F1 train and develop scores
F1_train_L1_reg = sklearn.metrics.f1_score(y_train_orig, y_train_L1_reg)
F1_develop_L1_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_L1_reg)

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_L1_reg:.2f}")
print(f"F1 for development set: {F1_develop_L1_reg:.2f}")

F1 for training set: 0.81
F1 for development set: 0.73


In [10]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  with L2 regularization.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#
def logistic_L2_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    reg = LogisticRegression(penalty='l2')

    # then fit your model to the train data
    reg.fit(X_train, Y_train)

    # then generate your prediction for the training set
    y_train_L2_reg = reg.predict(X_train)

    # then generate your prediction for the development set
    y_develop_L2_reg = reg.predict(X_develop)
    #========================================================================#
    #  This function should train a logistic regression model without
    #  regularization terms.
    #  Report the F1 score in your training and in your development sets.
    #========================================================================#
    return y_train_L2_reg, y_develop_L2_reg

y_train_L2_reg, y_develop_L2_reg = logistic_L2_regularization(X_train, y_train_orig, X_develop, y_develop_orig)

# get the F1 train and develop scores
F1_train_L2_reg = sklearn.metrics.f1_score(y_train_orig, y_train_L2_reg)
F1_develop_L2_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_L2_reg)

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_L2_reg:.2f}")
print(f"F1 for development set: {F1_develop_L2_reg:.2f}")

F1 for training set: 0.84
F1 for development set: 0.73


### Which one of the three classifiers performed the best on your training and development set? Did you observe any overfitting and did regularization help reduce it? Support your answers with the classifier performance you got.

### Answer:
Without Regulization = \
    F1 for training set: 0.93 \
    F1 for development set: 0.65

L1 Regularization = \
    F1 for training set: 0.81 \
    F1 for development set: 0.73

L2 Regularization = \
    F1 for training set: 0.84 \
    F1 for development set: 0.73

The regularization method that yielded the lowest difference between F1 scores for the training set and development set, indicating better generalization to new data, was L1 regularization with an F1 score of 0.81 for the training set and 0.73 for the development set. This suggests that L1 regularization helps the model achieve more consistent performance on both the training and development data, making it a promising choice for handling new, unseen data.

### Inspect the weight vector of the classifier with L1 regularization (in other words, look at the θ you got after training). You can access the weight vector of the trained model using the coef_attribute of a LogisticRegression instance. What are the most important words for deciding whether a tweet is about a real disaster or not? You might need to run some code (feel free to insert a code cell below).

In [11]:
N = 5  # 5 most important words

inspect_L1 = LogisticRegression(penalty='l1', solver='liblinear')
inspect_L1.fit(X_train, y_train_orig)

feature_names = vectorizer.get_feature_names_out()
weights = inspect_L1.coef_[0]

# Get the indices of the top N features with the highest absolute weights
top_feature_indices = np.argsort(-np.abs(weights))[:N]

for i in top_feature_indices:
    print(f"Feature: {feature_names[i]}, Weight: {weights[i]}")

Feature: spill, Weight: 4.049948694779842
Feature: outbreak, Weight: 3.918085900644332
Feature: airport, Weight: 3.6959135655508537
Feature: wreckage, Weight: 3.5905813287381183
Feature: debris, Weight: 3.1986892649544223


### Answer:

The five most important words are 'spill', 'outbreak', 'airport', 'wreckage', and 'debris'.

# Part (f): Bernoulli Naive Bayes.

In [12]:
class BernoulliNB(object):
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        #====================================================#
        # YOUR CODE HERE:
        #  You should build the Bernoully NB model from scratch
        #  Do not use sklearn, use numpy and other basic packages
        #    only.
        #  Please update and save the parameters
        #    "self.class_log_prior_" and "self.feature_prob_"
        #  These variables are just a suggestion to help
        #    structure your code - you do not need to use them
        #    if you would prefer not to
        #====================================================#
        count_sample = X.shape[0]
        separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
        self.class_log_prior_ = [np.log(len(i) / count_sample) for i in separated]
        count = np.array([np.array(i).sum(axis=0) for i in separated]) + self.alpha
        smoothing = 2 * self.alpha
        n_doc = np.array([len(i) + smoothing for i in separated])

        self.feature_log_prob_ = np.log(count / n_doc[np.newaxis].T)
        self.feature_log_prob_neg_ = np.log(1 / (1 + np.exp(self.feature_log_prob_)))
        #====================================================#
        # END YOUR CODE
        #====================================================#
        return self

    def predict_log_prob(self, X):
        #====================================================#
        # YOUR CODE HERE:
        #  You should build the Bernoully NB model from scratch
        #  Do not use sklearn, use numpy and other basic packages
        #    only.
        #  Please update and save the parameters
        #    "self.pred_log_prob_" and "y_pred"
        #  These variables are just a suggestion to help
        #    structure your code - you do not need to use them
        #    if you would prefer not to
        #====================================================#
        positive_prob = X.dot(self.feature_log_prob_.T) 
        ones_matrix = np.ones(X.shape)
        negative_prob = (ones_matrix - X).dot(self.feature_log_prob_neg_.T)

        return positive_prob + negative_prob + np.array(self.class_log_prior_)

    def predict(self, X):
        return np.argmax(self.predict_log_prob(X), axis=1)
    
        #====================================================#
        # END YOUR CODE
        #====================================================#
        return y_pred

# get the predictions y_train_NB and y_develop_NB
nb = BernoulliNB(alpha=1)
nb.fit(X_train, y_train_orig)
y_train_NB = nb.predict(X_train) # prediction from X_train using model
y_develop_NB = nb.predict(X_develop) # prediction from X_develop using model

# Convert predictions to NumPy arrays
y_train_NB = np.asarray(y_train_NB)
y_develop_NB = np.asarray(y_develop_NB)

# get the F1 train and develop scores
F1_train_NB = sklearn.metrics.f1_score(y_train_orig, y_train_NB)
F1_develop_NB = sklearn.metrics.f1_score(y_develop_orig, y_develop_NB)

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_NB:.2f}")
print(f"F1 for development set: {F1_develop_NB:.2f}")

F1 for training set: 0.78
F1 for development set: 0.72


# Part (g): Model comparison.

Question: Which model performed the best in predicting whether a tweet is of a real disaster or not? Include your performance metric in your response. Comment on the pros and cons of using generative vs discriminative models.

Answer:

Logistic Regression with L1 regularization (Discriminative Model)\
F1 for training set : 0.81\
F1 for development set : 0.73

Bernoulli Naive Bayes (Generative Model)\
F1 for training set : 0.78\
F1 for development set : 0.72

In my case, the Logistic Regression model outperformed the Bernoulli Naive Bayes model, which is not surprising given that Logistic Regression is known for its discriminative modeling capabilities and often works well for binary classification tasks. However, generative models can be valuable in certain scenarios, especially when dealing with limited data or when the independence assumptions align with the data's true structure.


Question: hink about the assumptions that Naive Bayes makes. How are the assumptions different from logistic regressions? Discuss whether it is valid and efficient to use Bernoulli Naive Bayes classifier for natural language texts.

Answer: In text classification for natural language processing, Bernoulli Naive Bayes leverages binary feature values, aligning well with the presence or absence of words in a document. Its simplicity and computational efficiency make it suitable for baseline models and resource-constrained scenarios. However, its significant limitation lies in the assumption of independence between words, which often doesn't hold in real-world texts due to complex word dependencies that contribute to overall meanings.

In contrast, Logistic Regression doesn't explicitly assume feature independence, offering greater flexibility in capturing intricate relationships between words and their impact on the class label. This makes it more suitable for tasks where word dependencies matter, although it may require more data and computational resources.

# Part (h): N-gram model.

In [13]:
#=======================================================================+#
# YOUR CODE HERE:
#  Featurized the preprocessed data: X_train_preproc and X_develop_preproc
#  using the N=2 gram model
#========================================================================#

vectorizer = None

def n_gram(data):
    global vectorizer
    M = 4
    if vectorizer is None:
        vectorizer = CountVectorizer(ngram_range=(2,2), binary=True, min_df=M)
        n_gram_data = vectorizer.fit_transform(data["text"]).toarray()
    else:
        n_gram_data = vectorizer.transform(data['text']).toarray()

    #========================================================================#
    # END CODE HERE
    #  This function should return the new data whose "text" feature contains
    #  only 0 and 1
    #========================================================================#
    return n_gram_data # Feel free to change the variable name

# get the featurized data
X_train_gram   = n_gram(X_train_preproc)
X_develop_gram = n_gram(X_develop_preproc)

#=======================================================================+#
# YOUR CODE HERE:
#  Use the functions you already defined "X_train_gram" and "X_develop_gram"
#  to re-run:
#  Logistic Regression with no regularization Model
#  Logistic Regression with L1 regularization Model
#  Logistic Regression with L2 regularization Model
#========================================================================#
y_train_gram_no_reg, y_develop_gram_no_reg = logistic_without_regularization(X_train_gram, y_train_orig, X_develop_gram, y_develop_orig)
y_train_gram_L1_reg, y_develop_gram_L1_reg = logistic_L1_regularization(X_train_gram, y_train_orig, X_develop_gram, y_develop_orig)
y_train_gram_L2_reg, y_develop_gram_L2_reg = logistic_L2_regularization(X_train_gram, y_train_orig, X_develop_gram, y_develop_orig)
nb.fit(X_train_gram, y_train_orig)
y_train_gram_NB = nb.predict(X_train_gram)
y_develop_gram_NB = nb.predict(X_develop_gram)

y_develop_gram_NB = np.asarray(y_develop_gram_NB)
#========================================================================#
# END CODE HERE
#========================================================================#

# get the F1 train and develop scores for no regularization model
F1_train_gram_no_reg = sklearn.metrics.f1_score(y_train_orig, y_train_gram_no_reg)
F1_develop_gram_no_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_gram_no_reg)
# get the F1 train and develop scores for L1 regularization model
F1_train_gram_L1_reg = sklearn.metrics.f1_score(y_train_orig, y_train_gram_L1_reg)
F1_develop_gram_L1_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_gram_L1_reg)
# get the F1 train and develop scores for L2 regularization model
F1_train_gram_L2_reg = sklearn.metrics.f1_score(y_train_orig, y_train_gram_L2_reg)
F1_develop_gram_L2_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_gram_L2_reg)
# get the F1 train and develop scores for Bernoulli NB model
F1_train_gram_NB = sklearn.metrics.f1_score(y_train_orig, y_train_gram_NB)
F1_develop_gram_NB = sklearn.metrics.f1_score(y_develop_orig, y_develop_gram_NB)

# print the F1 train and develop scores for no regularization model
print(f"F1 for training set: {F1_train_gram_NB:.2f}")
print(f"F1 for development set: {F1_develop_gram_NB:.2f}")
# print the F1 train and develop scores for L1 regularization model
print(f"F1 for training set: {F1_train_gram_L1_reg:.2f}")
print(f"F1 for development set: {F1_develop_gram_L1_reg:.2f}")
# print the F1 train and develop scores for L2 regularization model
print(f"F1 for training set: {F1_train_gram_L2_reg:.2f}")
print(f"F1 for development set: {F1_develop_gram_L2_reg:.2f}")
# print the F1 train and develop scores for Bernoulli NB model
print(f"F1 for training set: {F1_train_gram_NB:.2f}")
print(f"F1 for development set: {F1_develop_gram_NB:.2f}")



F1 for training set: 0.51
F1 for development set: 0.42
F1 for training set: 0.51
F1 for development set: 0.41
F1 for training set: 0.54
F1 for development set: 0.44
F1 for training set: 0.51
F1 for development set: 0.42


In [14]:
# Get the vocabulary
vocab = vectorizer.get_feature_names_out()

# Calculate the total number of 2-grams in the vocabulary
num_2grams = len(vocab)

# Get the first 5 2-grams from the vocabulary
vocab_5 = vocab[:5]

# Print the results
print(f"Total number of 2-grams in vocabulary: {num_2grams}")
print(f"First 5 2-grams from vocabulary: {', '.join(vocab_5)}")

Total number of 2-grams in vocabulary: 708
First 5 2-grams from vocabulary: aba woman, abandoned aircraft, abc news, access secret, accident expert


When determining the appropriate threshold value, M, for a model, it is a standard practice to consider the frequency of terms or n-grams. Striking the right balance between retaining rare n-grams and filtering out less relevant 2-grams is crucial. In this context, I decided to set M at 4. This choice is often guided by a widely accepted heuristic, which posits that a word should appear in multiple tweets to be considered significant.

Overall, the bag of words outperformed every single models compared to n gram function both training and development set. The reason for this might be the way n gram model captures texts in a 2-grams form. This might be good at capturing local context betweed words, but in this case, since tweets are short, it seems to be not that efficient to carry more weight on each words.

# Part (i): Determine performance with the test set.

In [15]:
#=======================================================================+#
# YOUR CODE HERE:
#  Re-build your feature vectors on the entire Kaggle train set
#  (i.e. DO NOT split the train set into a further train set and development set)
#========================================================================#

train_data = pd.read_csv('train.csv')
test_data  = pd.read_csv('test.csv')

# get train data
X_train    = train_data.drop(columns= 'target')
y_train    = train_data['target']

# get test data
X_test     = test_data

# Preprocess
X_train['text']   = X_train['text'].apply(pre_process)
X_test['text'] = X_test['text'].apply(pre_process)

#========================================================================#
# END CODE HERE
#========================================================================#

In [16]:
#=======================================================================+#
# YOUR CODE HERE:
#  Re-train your preferred classifier (see below) on the entire train set
#  (i.e. DO NOT split the train set into a further train set and development set)
#  Your preferred classifier may inculde either bag of word or n-gram,
#  and using either logistic regression or Bernoulli naive bayes
#========================================================================#

# Bag of Words 
X_train_text = bag_of_word(X_train)
X_test_text = bag_of_word(X_test)

# Fitting L1 regulization 
reg = LogisticRegression(penalty='l1', solver='liblinear')
reg.fit(X_train_text, y_train)

# Prediction on test set
y_test_L1_reg = reg.predict(X_test_text)

#========================================================================#
# END CODE HERE
#========================================================================#

In [17]:
#=======================================================================+#
# YOUR CODE HERE:
#  Report the resulting F 1-score on the test data, as reported by Kaggle
#========================================================================#

pred_df = pd.DataFrame(y_test_L1_reg, columns=['target']) 
to_submit = pd.concat([X_test['id'], pred_df], axis=1)
to_submit['id'] = to_submit['id'].astype(int)

to_submit.to_csv('results.csv', index=False)

#========================================================================#
# END CODE HERE
#========================================================================#

![Results](kaggle.png)

For the kaggle result, the score was 0.692. Considering the fact that I have chosen the best models for my prediction (bag of words, L1 regulization), I expected a higher score. There might be several reasons for this result, and one of them is maybe some randomness has been included in the model it self.