<a href="https://colab.research.google.com/github/aaldayarova/twitter-disasters/blob/main/disaster_nlp_hw2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop below.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries consumed by the download loop.
# The URL is a signed link (see X-Goog-Date / X-Goog-Expires in the query string), so it stops working
# after its validity window; the loop reports that case as "likely expired".
DATA_SOURCE_MAPPING = 'nlp-getting-started:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F17777%2F869809%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241006%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241006T164231Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D56ea77358e875e11d6d83ed2e317f6b4a03b94961dde60084b50153e9c38c77b9815d38130a725783e09f6c7475d26e03c3a4c82a4f526acc40e4ab49c86fb9c66978f7b8dfa231fca742a1d5f797a669c3d7db034ec16d552dab69f315b114ac8260096e4528a003c66cbc71ea44e6691349c549eff30e101650a74fc674be8458acdb8e97a0dc9729653f0707f81d7a73d588f90be4448853ea7f458915b2c9c5aa9079eb3e42dfee96d851423f9d4bc1763392891cb787156a2f76693bb543c531ac2554d65a8d011ad0b012e66b76bf0d390218c1166f68ab853f31eced39b18ce523e30678a5b3e9d3c9bbcf81221bc71034a6e78adc0dfc6215160b94b'

# Root directories that mirror the Kaggle layout: downloaded inputs and notebook outputs.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and unpack it
# into KAGGLE_INPUT_PATH/<directory>. A failed download is reported and the loop moves on.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length can be missing (or zero); in that case show a plain byte counter
            # instead of dividing by it.
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if expected_bytes:
                    done = min(50, int(50 * dl / expected_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch re-opens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would replace the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading nlp-getting-started, 607343 bytes compressed
Downloaded and uncompressed: nlp-getting-started
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import nltk # imports the nltk library
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.stem import WordNetLemmatizer as wnl
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
nltk.download('wordnet') # fetches the WordNet corpus used by the lemmatizer

# Input data files are available in the read-only "../input/" directory
# Running this cell prints the full path of every file found below /kaggle/input

import os
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

[nltk_data] Downloading package wordnet to /root/nltk_data...


/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv


In [None]:
# Initialize the training dataset and produce a development set from it
train_data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

# A fixed seed keeps the 70/30 split - and every score computed from it - identical across re-runs.
SPLIT_SEED = 42
train_df, dev_df = train_test_split(train_data, test_size=0.3, random_state=SPLIT_SEED)

# Work on independent copies: later cells overwrite columns of both frames, which must not
# touch train_data and should not raise SettingWithCopyWarning.
train_df, dev_df = train_df.copy(), dev_df.copy()

# Pre-processing Data

In [None]:
# One lemmatizer instance is enough; the original built a new one for every single word.
lemmatizer = wnl()

def lemmatize_words(value):
    """Lemmatize each whitespace-separated word of `value`; missing values pass through unchanged."""
    if pd.isna(value):
        return value
    return ' '.join(lemmatizer.lemmatize(word) for word in value.split())

# Apply the same two steps to the training and the development frame
for frame in (train_df, dev_df):
    # Converting text to lowercase (.str.lower() keeps missing values as NaN)
    frame[['text', 'keyword', 'location']] = frame[['text', 'keyword', 'location']].apply(lambda x: x.str.lower())

    # Lemmatizing text in 'text' and 'keyword' columns
    for column in ['text', 'keyword']:
        frame[column] = frame[column].apply(lemmatize_words)


In [None]:
# TRAIN DATA
# Strip URLs, @s, and #s (our decision)
train_df[['text', 'location']] = train_df[['text', 'location']].apply(lambda x: x.str.replace(r'http\S+', '', regex=True))
train_df['text'] = train_df['text'].replace(r'@\S+', '', regex=True)
train_df['text'] = train_df['text'].replace(r'#\S+', '', regex=True)

# Remove non-ascii characters (our decision)
train_df[['text', 'location', 'keyword']] = train_df[['text', 'location', 'keyword']].apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

# Remove punctuation & numbers (our decision)
train_df[['text', 'location', 'keyword']] = train_df[['text', 'location', 'keyword']].replace(r'[^\w\s]', '', regex=True)
train_df[['text', 'location', 'keyword']] = train_df[['text', 'location', 'keyword']].replace(r'[0-9]', '', regex=True)

# Remove stop words
# Missing values are passed through untouched: str(NaN) is the literal text 'nan', which the
# previous version wrote into every empty keyword/location cell.
stop_words = ['the', 'and', 'is', 'a', 'an', 'in', 'of', 'on', 'are', 'be', 'if', 'into', 'or', 'for']
train_df[['text', 'location', 'keyword']] = train_df[['text', 'location', 'keyword']].apply(
    lambda x: x.apply(
        lambda y: ' '.join([word for word in str(y).split() if word not in (stop_words)]) if pd.notna(y) else y
    )
)
train_df

Unnamed: 0,id,keyword,location,text,target
5815,8298,rubble,,my parent so impulsive sometimes i remember co...,0
5180,7392,obliterate,texas,watch sarah palin obliterate planned parenthoo...,0
7293,10434,whirlwind,bristol england,win lisowski whitewash whirlwind round shangha...,1
6273,8963,storm,,nike golf storm fit golf jacket black medium,0
3923,5579,flood,new york,w cree led work light bar alloy spot flood com...,0
...,...,...,...,...,...
4466,6349,hostage,,hostage meaningless might well just play cod s...,0
5224,7465,obliteration,new orleans louisiana,why did god order obliteration ancient canaanites,0
5630,8030,refugee,warri,cameroon repatriated nigerian refugee,1
4065,5777,forestfires,portland oregon,inciweb update rogue riversiskiyou national fo...,1


In [None]:
# DEV DATA
# Strip URLs, @s, and #s (our decision)
dev_df[['text', 'location']] = dev_df[['text', 'location']].apply(lambda x: x.str.replace(r'http\S+', '', regex=True))
dev_df['text'] = dev_df['text'].replace(r'@\S+', '', regex=True)
dev_df['text'] = dev_df['text'].replace(r'#\S+', '', regex=True)

# Remove non-ascii characters (our decision)
dev_df[['text', 'location', 'keyword']] = dev_df[['text', 'location', 'keyword']].apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

# Remove punctuation & numbers (our decision)
dev_df[['text', 'location', 'keyword']] = dev_df[['text', 'location', 'keyword']].replace(r'[^\w\s]', '', regex=True)
dev_df[['text', 'location', 'keyword']] = dev_df[['text', 'location', 'keyword']].replace(r'[0-9]', '', regex=True)

# Remove stop words (same stop_words list as for the training data)
# Missing values are passed through untouched: str(NaN) is the literal text 'nan', which the
# previous version wrote into every empty keyword/location cell.
dev_df[['text', 'location', 'keyword']] = dev_df[['text', 'location', 'keyword']].apply(
    lambda x: x.apply(
        lambda y: ' '.join([word for word in str(y).split() if word not in (stop_words)]) if pd.notna(y) else y
    )
)
dev_df

Unnamed: 0,id,keyword,location,text,target
7420,10613,wounded,usa,police officer wounded suspect dead after exch...,1
2606,3741,destroyed,,oiled up as hole destroyed with king size cock...,0
4772,6790,lightning,asheboro nc,some crazy lightning outside,0
4168,5921,harm,kansas city,love what you picked were playing worth it by ...,0
733,1061,bleeding,live oak tx,yes im bleeding heart liberal,1
...,...,...,...,...,...
2816,4049,displaced,pedophile hunting ground,displaced at point no return like condition ih...,1
785,1138,blight,maryland usa,i agree but i knew wed going to deep road agai...,0
5304,7577,outbreak,akure city ondo state,family to sue over legionnaires more than fami...,1
6393,9137,suicidebomb,worldwide,th day since jul nigeria suicide bomb attack k...,1


# Bag of Words Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the training and development datasets into binary bag-of-words matrices
M = 10 # Minimum number of training tweets a word must appear in to be kept in the vocabulary
count_vect = CountVectorizer(binary=True, min_df=M)
X_train = count_vect.fit_transform(train_df['text']) # Learns the vocabulary from the training tweets and encodes them
X_dev = count_vect.transform(dev_df['text']) # Encodes the development tweets with that vocabulary; nothing is learned from them
print(X_train.shape, X_dev.shape)

# Each shape is (number of tweets, vocabulary size); both matrices share the same vocabulary size.
# The vocabulary size depends on which tweets end up in the training split, so it is not hard-coded here.
# A feature vector phi(x) would be a row from the above matrix

(5329, 1082) (2284, 1082)


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score

# NO REGULARIZATION
our_log_reg = LogisticRegression(max_iter=1000, penalty=None)
y_train = train_df['target']

# Train the model on our training data
our_log_reg.fit(X_train, y_train)

# Predict targets for the development and training sets
y_dev_pred = our_log_reg.predict(X_dev)
y_train_pred = our_log_reg.predict(X_train)

# F1 score (harmonic mean of precision and recall) via scikit-learn; unlike the manual
# 2PR/(P+R) formula it does not produce nan when precision and recall are both zero.
f1_dev = f1_score(dev_df['target'], y_dev_pred)
f1_train = f1_score(train_df['target'], y_train_pred)
print('F1-score for development set: ', f1_dev)
print('F1-score for training set: ', f1_train)

F1-score for development set:  0.694129763130793
F1-score for training set:  0.8518434743270752


In [None]:
# L1 REGULARIZATION
our_log_reg_l1 = LogisticRegression(max_iter=1000, penalty='l1', solver="liblinear")

# Train the model on our training data
our_log_reg_l1.fit(X_train, y_train)

# Predict targets for the development and training sets
y_dev_pred = our_log_reg_l1.predict(X_dev)
y_train_pred = our_log_reg_l1.predict(X_train)

# F1 score (harmonic mean of precision and recall) via scikit-learn; unlike the manual
# 2PR/(P+R) formula it does not produce nan when precision and recall are both zero.
f1_dev = f1_score(dev_df['target'], y_dev_pred)
f1_train = f1_score(train_df['target'], y_train_pred)
print('F1-score for development set: ', f1_dev)
print('F1-score for training set: ', f1_train)

F1-score for development set:  0.7073036792970895
F1-score for training set:  0.8141509433962264


In [None]:
# L2 REGULARIZATION - BEST
our_log_reg_l2 = LogisticRegression(max_iter=1000, penalty='l2', solver="liblinear")

# Train the model on our training data
our_log_reg_l2.fit(X_train, y_train)

# Predict targets for the development and training sets
y_dev_pred = our_log_reg_l2.predict(X_dev)
y_train_pred = our_log_reg_l2.predict(X_train)

# F1 score (harmonic mean of precision and recall) via scikit-learn; unlike the manual
# 2PR/(P+R) formula it does not produce nan when precision and recall are both zero.
f1_dev = f1_score(dev_df['target'], y_dev_pred)
f1_train = f1_score(train_df['target'], y_train_pred)
print('F1-score for development set: ', f1_dev)
print('F1-score for training set: ', f1_train)

F1-score for development set:  0.7107526881720432
F1-score for training set:  0.8221806270472625


In [None]:
# Which words push a tweet most strongly towards the "real disaster" class?
# Sort the feature indices of the L1-regularized model by coefficient, largest first,
# and print the vocabulary entries behind the first ten of them.
vocab = count_vect.get_feature_names_out()
top_indices = our_log_reg_l1.coef_[0].argsort()[::-1]

for rank, word_index in enumerate(top_indices[:10], start=1):
  print(f"#{rank}:{vocab[word_index]}")


#1:migrant
#2:derailment
#3:typhoon
#4:wildfire
#5:spill
#6:hiroshima
#7:fires
#8:airport
#9:killed
#10:crew


# Bernoulli Naive Bayes

In [None]:
# Directly pulled from lecture notes
n, d = X_train.shape # n: number of training tweets, d: number of features
K = 2 # number of classes (disaster tweet or not)
a = 1 # pseudo-count for Laplace smoothing

# psis is the K x d matrix of per-class feature probabilities; phis holds the K class priors
psis = np.zeros((K, d))
phis = np.zeros(K)

# estimate the parameters one class at a time
for k in range(K):
    X_k = X_train[y_train == k] # rows belonging to class k
    n_k = X_k.shape[0] # number of training tweets in class k
    # smoothed share of class-k tweets that contain each word
    psis[k] = (X_k.sum(axis=0) + a) / (n_k + 2*a)
    # prior probability of class k
    phis[k] = n_k / float(n)

# print out the class proportions
print(phis)

[0.57290298 0.42709702]


In [None]:
# Computing predictions using Bayes' rule and parameters derived above
def nb_predictions(x, psis, phis):
    r"""Return class assignments and scores under the Bernoulli Naive Bayes model.

    We compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y), working in log space.

    Parameters
    ----------
    x : scipy sparse matrix or numpy array of shape (n, d)
        Feature matrix, one row per example.
    psis : array-like with K*d entries (normally of shape (K, d))
        psis[k, j] is the probability that feature j is present in class k.
    phis : array-like of length K
        Prior probability of each class.

    Returns
    -------
    predictions : numpy array of shape (n,)
        Index of the highest-scoring class for every row of x.
    logpyx : numpy array of shape (K, n)
        Unnormalised log-posteriors, log p(x|y=k) + log p(y=k).
    """
    n, d = x.shape
    phis = np.asarray(phis, dtype=float)
    # The number of classes comes from the priors, not from a notebook-level global.
    n_classes = phis.size
    psis = np.reshape(np.asarray(psis, dtype=float), (n_classes, d))

    psis = psis.clip(1e-14, 1-1e-14) # clip probabilities to avoid log(0)

    # compute log-probabilities
    log_present = np.log(psis)
    log_absent = np.log(1 - psis)
    logpy = np.log(phis).reshape([n_classes, 1])

    # sum_j [x_j*log(psi_kj) + (1-x_j)*log(1-psi_kj)]
    #   = x . (log(psi_k) - log(1-psi_k)) + sum_j log(1-psi_kj)
    # A single matrix product avoids materialising dense (K, n, d) temporaries and
    # works for sparse and dense x alike.
    feature_scores = np.asarray(x @ (log_present - log_absent).T)  # shape (n, K)
    logpxy = feature_scores.T + log_absent.sum(axis=1, keepdims=True)
    logpyx = logpxy + logpy

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([n_classes, n])

# Creating predictions for our development set:
# idx_dev holds the predicted class index per development tweet,
# logpyx the per-class scores returned alongside them.
idx_dev, logpyx = nb_predictions(X_dev, psis, phis)
# Show the first ten predicted labels as a quick sanity check
print(idx_dev[:10])

[1 0 1 0 0 0 1 0 0 1]


In [None]:
# Evaluating the model's F1 score on the development set

# F1 score (harmonic mean of precision and recall) via scikit-learn; unlike the manual
# 2PR/(P+R) formula it does not produce nan when precision and recall are both zero.
f1_dev = f1_score(dev_df['target'], idx_dev)
print('F1-score for development set: ', f1_dev)

F1-score for development set:  0.707505518763797


# N-gram Model

In [None]:
# Vectorize the training and development datasets with 2-grams.
# The threshold and vectorizer get their own names so that the bag-of-words `M` and `count_vect`
# (still read by the "most important words" cell) are not overwritten by this cell.
M_NGRAM = 2 # Minimum number of training tweets a 2-gram must appear in to be kept in the vocabulary
count_vect_ngram = CountVectorizer(binary=True, min_df=M_NGRAM, ngram_range=(2,2))
X_train_ngram = count_vect_ngram.fit_transform(train_df['text'])
X_dev_ngram = count_vect_ngram.transform(dev_df['text'])
print(X_train_ngram.shape, X_dev_ngram.shape)
print("10 2-grams in our vocabulary: ", count_vect_ngram.get_feature_names_out()[:10])

(5329, 5183) (2284, 5183)
10 2-grams in our vocabulary:  ['aba woman' 'abandoned aircraft' 'abbswinston kidnapped' 'abc news'
 'abc online' 'ability to' 'able to' 'abomination that' 'about bridge'
 'about cable']


In [None]:
# Logistic regression on n-gram model
# L2 REGULARIZATION
our_log_reg_l2_ngram = LogisticRegression(max_iter=1000, penalty='l2', solver="liblinear")

# Train the model on our training data
our_log_reg_l2_ngram.fit(X_train_ngram, y_train)

# Predict targets for the development and training sets
y_dev_pred = our_log_reg_l2_ngram.predict(X_dev_ngram)
y_train_pred = our_log_reg_l2_ngram.predict(X_train_ngram)

# F1 score (harmonic mean of precision and recall) via scikit-learn; unlike the manual
# 2PR/(P+R) formula it does not produce nan when precision and recall are both zero.
f1_dev = f1_score(dev_df['target'], y_dev_pred)
f1_train = f1_score(train_df['target'], y_train_pred)
print('F1-score for development set: ', f1_dev)
print('F1-score for training set: ', f1_train)

F1-score for development set:  0.573107049608355
F1-score for training set:  0.8101071975497702


In [None]:
# Bernoulli Naive Bayes on n-gram model
n, d = X_train_ngram.shape # n: number of training tweets, d: number of 2-gram features
K = 2 # number of classes (disaster tweet or not)
a = 1 # pseudo-count for Laplace smoothing

# psis is the K x d matrix of per-class feature probabilities; phis holds the K class priors
psis = np.zeros((K, d))
phis = np.zeros(K)

# estimate the parameters one class at a time
for k in range(K):
    X_k = X_train_ngram[y_train == k] # rows belonging to class k
    n_k = X_k.shape[0] # number of training tweets in class k
    psis[k] = (X_k.sum(axis=0) + a) / (n_k + 2*a)
    phis[k] = n_k / float(n)

# print out the class proportions
print("The phis: ", phis)

# Score the development set with the fitted parameters
idx_dev_ngram, logpyx = nb_predictions(X_dev_ngram, psis, phis)
print("The first 10 predictions: ", idx_dev_ngram[:10])

The phis:  [0.57290298 0.42709702]
The first 10 predictions:  [1 0 0 1 0 0 1 0 0 0]


In [None]:
# Evaluating the model's F1 score on the development set

# F1 score (harmonic mean of precision and recall) via scikit-learn; unlike the manual
# 2PR/(P+R) formula it does not produce nan when precision and recall are both zero.
f1_dev = f1_score(dev_df['target'], idx_dev_ngram)
print('F1-score for development set: ', f1_dev)

F1-score for development set:  0.5125989920806336


# Training on the Full Kaggle Data

In [None]:
# BAG OF WORDS
# NOTE(review): the vectorizer is fitted on the raw train_data['text'] and applied to the raw test
# text; the clean-up done on train_df/dev_df above (lemmatizing, URL/mention/hashtag stripping,
# stop-word removal) is not applied here, so this pipeline differs from the one that was evaluated.
M = 10 # Minimum number of training tweets a word must appear in to be kept in the vocabulary
count_vect = CountVectorizer(binary=True, min_df=M)
X_train = count_vect.fit_transform(train_data['text'])
test_data = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
X_test = count_vect.transform(test_data['text'])
# Report the two matrices built in this cell (the previous version printed the stale X_dev shape)
print(X_train.shape, X_test.shape)

(7613, 1550) (2284, 1082)


In [None]:
# L2 REGULARIZATION
# Labels for the complete Kaggle training file
y_train = train_data['target']

# Build the L2-regularized model and fit it on the full training matrix in one step
our_log_reg_l2 = LogisticRegression(penalty='l2', solver="liblinear", max_iter=1000).fit(X_train, y_train)

# Predict the test tweets and write the id/target pairs as the submission file
y_test_pred = our_log_reg_l2.predict(X_test)
output = pd.DataFrame({'id': test_data['id'], 'target': y_test_pred})
output.to_csv('submission_hw2.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
