In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

# suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
# read the data
df = pd.read_csv('https://raw.githubusercontent.com/nikjohn7/Disaster-Tweets-Kaggle/main/data/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# how many rows and columns are in the data set?
df.shape

(7613, 5)

In [4]:
# how many speeches are there from the United States?
df[df['country'] == 'USA'].shape

KeyError: 'country'

In [None]:
# how many speeches are there from CANADA?
df[df['country'] == 'CAN'].shape

(46, 7)

In [None]:
# show rows for the United States
# df[df['country'] == 'USA']

In [None]:
# show rows for Canada
# df[ df["country_name"] == 'Canada' ]

In [None]:
import nltk
nltk.download('stopwords')

stopwords = set(nltk.corpus.stopwords.words('english'))

include_stopwords = {'dear', 'regards', 'must', 'would', 'also', 
                     'canada', 'canadian', 'canadians', 'prime', 'minister', 'province', 'provinces'
                     'united', 'states', 'state', 'america', 'american', 'president','secretary'}
exclude_stopwords = {'against'}

stopwords |= include_stopwords
stopwords -= exclude_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# build a text processing and classifier pipeline
# to predict the country (USA or Canada) of a speech

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

df2 = df[df['country'].isin(['USA', 'CAN'])]

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df2['text'], df2['country'], test_size=0.2)

# Create a pipeline that first transforms the text data into TF-IDF vectors, then applies SVM
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(stopwords))),
    ('clf', svm.SVC()),
])

# Train the classifier
text_clf.fit(X_train, y_train)

# Predict the test set results
y_pred = text_clf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred, target_names=['USA', 'CAN']))


              precision    recall  f1-score   support

         USA       0.89      1.00      0.94         8
         CAN       1.00      0.91      0.95        11

    accuracy                           0.95        19
   macro avg       0.94      0.95      0.95        19
weighted avg       0.95      0.95      0.95        19



In [None]:
# This script creates a new column 'sentiment' in the dataframe, 
# which contains the sentiment score of the text. 
# The sentiment score is a float within the range [-1.0, 1.0], 
# where -1.0 denotes a very negative sentiment, 
# 1.0 denotes a very positive sentiment, 
# and values around 0 denote a neutral sentiment.

from textblob import TextBlob

# Define a function to apply sentiment analysis to a text
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity  # returns a value between -1 and 1

# Create a new column 'sentiment' in the DataFrame
df2['sentiment'] = df2['text'].apply(get_sentiment)

# Display the DataFrame
# df2

In [None]:
# find average sentiment for each country in df2
df2.groupby('country')['sentiment'].mean()

country
CAN    0.112540
USA    0.110635
Name: sentiment, dtype: float64

In [None]:
# find average sentiment for each speaker in df2
# order the results from most positive to most negative

# df2.groupby('speaker')['sentiment'].mean().sort_values(ascending=False).head(5)

In [None]:
# df2.groupby('year')['sentiment'].mean().sort_values(ascending=False)