# TITLE 

## By Sarah Prusaitis, Rick Lataille, and Allison Ward

## Overview
overview/project intro here

## Business Problem
business problem paragraph here

## Data Limitations
data limitations info here

# EDA

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

import re
import string
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer, word_tokenize, regexp_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Read original data
df1 = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding = 'ISO-8859-1')

In [3]:
# Read synthetic data
df2 = pd.read_csv('data/Apple_Product_Negative_ Tweets_Sheet1.csv', encoding = 'ISO-8859-1')

In [4]:
# Rename columns for simplicity
df1 = df1.rename(columns = {'tweet_text': 'tweet', 
                         'emotion_in_tweet_is_directed_at': 'product', 
                         'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'})

In [5]:
# Combine rows into a single DataFrame
df = pd.concat([df1, df2], ignore_index = True)

In [6]:
df['tweet'] = df['tweet'].astype(str)

In [7]:
def replace_emoticons(text):
    # Define a dictionary mapping emoticons to their corresponding meanings
    emoticon_mapping = {
        ':D': 'emojismile',
        ':)': 'emojismile',
        ':-D': 'emojismile',
        ':\'': 'emojiunsure',
        ':p': 'emojitongue',
        ':P': 'emojitongue',
        ':(': 'emojisad'
        # Add more emoticons and their meanings as needed
    }
    pattern = re.compile('|'.join(re.escape(emoticon) for emoticon in emoticon_mapping.keys()))
    
    def replace(match):
        return emoticon_mapping[match.group(0)]

    return pattern.sub(replace, text)

In [8]:
# Replace emoticons with mapped strings
df['tweet'] = df['tweet'].apply(replace_emoticons)

In [9]:
def preprocess_tweet(tweet):
    # Remove links and mentions
    tweet = re.sub(r'http\S+|@\S+', '', tweet)
    
    # Remove {link}
    tweet = re.sub(r'\{link\}', '', tweet)
    
    # Replace &quot; with "
    tweet = tweet.replace('&quot;', '"')
    
    # Remove extra space between quotation mark and words
    tweet = re.sub(r'\s+"', '"', tweet)
    tweet = re.sub(r'"\s+', '"', tweet)
    
    # Convert to lowercase
    tweet = tweet.lower()
    
    # Remove numbers
    tweet = re.sub(r'\d+', '', tweet)
    
    # Remove punctuation
    tweet = re.sub(r'([^\w\s]|_)+', ' ', tweet)
    
    # Tokenize
    tokens = nltk.word_tokenize(tweet)
    
    # Part-of-speech tagging
    tagged_tokens = nltk.pos_tag(tokens)
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for word, pos in tagged_tokens:
        if pos.startswith('J'):
            pos = 'a'  # Adjective
        elif pos.startswith('V'):
            pos = 'v'  # Verb
        elif pos.startswith('N'):
            pos = 'n'  # Noun
        elif pos.startswith('R'):
            pos = 'r'  # Adverb
        else:
            pos = 'n'  # Default to noun
        lemma = lemmatizer.lemmatize(word, pos=pos)
        lemmatized_tokens.append(lemma)
    
    # Add additional stopwords
    additional_stopwords = {'w', 'u', 'amp', 'sxsw', 'rt'}  # amp = & 
    stop_words = set(stopwords.words('english')) | additional_stopwords
    
    # Remove stopwords
    tweet = [word for word in tokens if word not in stop_words]
    
    return tweet

In [10]:
df['tweet'] = df['tweet'].astype(str).apply(preprocess_tweet)

In [11]:
# Combined and renamed Apple products and non Apple products 

df['product'] = df['product'].replace({
    'iPad': 'Apple',
    'Apple': 'Apple',
    'iPad or iPhone App': 'Apple',
    'iPhone': 'Apple',
    'Other Apple product or service': 'Apple',
    'Google': 'Other',
    'Other Google product or service': 'Other',
    'Android App': 'Other',
    'Android': 'Other'
})
#there are 5802 rows that are null - what should we do with those?

In [12]:
# Filter DataFrame for only Apple tweets
df_apple = df[df['product']=='Apple'].reset_index(drop=True)

In [13]:
# Consolidate no emotion entries, and drop
df_apple['sentiment'] = df_apple['sentiment'].replace("I can't tell", "No emotion toward brand or product")
df_apple = df_apple.drop(df_apple[df_apple['sentiment'] == 'No emotion toward brand or product'].index).reset_index(drop=True)

In [14]:
# Label target with 1's and 0's
df_apple['target'] = df_apple['sentiment'].map({'Positive emotion': 1, 'Negative emotion': 0})

In [15]:
# train test split
X = df_apple['tweet']
y = df_apple['target'] # Target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [16]:
# Save tokenized data for Doc2Vec vectorizer
tokenized_train_data = X_train
tokenized_test_data = X_test

In [17]:
# Convert tokenized tweets back into strings for TfidfVectorizer
X_train_str = X_train.apply(lambda x: ' '.join(x))
X_test_str = X_test.apply(lambda x: ' '.join(x))

### TF-IDF Vectorization

In [20]:
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()  #ngram_range=(1,2)

# Fit and transform the vectorizer on the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_str)
X_test_tfidf = tfidf_vectorizer.transform(X_test_str)

### Logistic Regression

In [21]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_tfidf, y_train)

In [22]:
y_preds_lr = logreg.predict(X_train_tfidf)
y_test_preds_lr = logreg.predict(X_test_tfidf)

In [23]:
print(classification_report(y_train, y_preds_lr))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95      1511
           1       0.93      0.98      0.96      1539

    accuracy                           0.95      3050
   macro avg       0.96      0.95      0.95      3050
weighted avg       0.96      0.95      0.95      3050



In [24]:
print(classification_report(y_test,y_test_preds_lr))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88       353
           1       0.89      0.91      0.90       410

    accuracy                           0.89       763
   macro avg       0.89      0.89      0.89       763
weighted avg       0.89      0.89      0.89       763



### Multinomial Naive Bayes

In [25]:
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)

In [26]:
y_preds_nmb = mnb.predict(X_train_tfidf)
y_preds_test_nmb = mnb.predict(X_test_tfidf)

In [27]:
print(classification_report(y_train, y_preds_nmb))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95      1511
           1       0.93      0.98      0.96      1539

    accuracy                           0.95      3050
   macro avg       0.96      0.95      0.95      3050
weighted avg       0.96      0.95      0.95      3050



In [28]:
print(classification_report(y_test, y_preds_test_nmb))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89       353
           1       0.89      0.94      0.91       410

    accuracy                           0.90       763
   macro avg       0.91      0.90      0.90       763
weighted avg       0.90      0.90      0.90       763



### Support Vector Machines

In [29]:
svc = SVC(random_state=42)
svc.fit(X_train_tfidf, y_train)

In [30]:
y_preds_svc = svc.predict(X_train_tfidf)
y_preds_test_svc = svc.predict(X_test_tfidf)

In [31]:
print(classification_report(y_train, y_preds_svc))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1511
           1       0.99      1.00      0.99      1539

    accuracy                           0.99      3050
   macro avg       0.99      0.99      0.99      3050
weighted avg       0.99      0.99      0.99      3050



In [32]:
print(classification_report(y_test, y_preds_test_svc))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       353
           1       0.90      0.92      0.91       410

    accuracy                           0.91       763
   macro avg       0.91      0.91      0.91       763
weighted avg       0.91      0.91      0.91       763



In [33]:
params = {'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
          'degree':[2,3,4],
          'shrinking':[True,False],
         }

In [34]:
svc_grid = GridSearchCV(svc, param_grid=params, cv=5)

In [35]:
start = time.time()
svc_grid.fit(X_train_tfidf, y_train)
end = time.time()
print(f'{end-start} seconds')

76.40418481826782 seconds


In [37]:
print(svc_grid.best_estimator_)
print(svc_grid.best_params_)

SVC(degree=2, kernel='poly', random_state=42)
{'degree': 2, 'kernel': 'poly', 'shrinking': True}


In [38]:
svc_tuned = SVC(degree=2, kernel='poly', shrinking=True, random_state=42)
svc_tuned.fit(X_train_tfidf, y_train)

In [39]:
y_preds_svc = svc_tuned.predict(X_train_tfidf)
y_preds_test_svc = svc_tuned.predict(X_test_tfidf)

In [40]:
print(classification_report(y_train, y_preds_svc))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1511
           1       0.99      1.00      1.00      1539

    accuracy                           1.00      3050
   macro avg       1.00      1.00      1.00      3050
weighted avg       1.00      1.00      1.00      3050



In [41]:
print(classification_report(y_test, y_preds_test_svc))

              precision    recall  f1-score   support

           0       0.92      0.89      0.91       353
           1       0.91      0.93      0.92       410

    accuracy                           0.91       763
   macro avg       0.91      0.91      0.91       763
weighted avg       0.91      0.91      0.91       763



### Random Forest

In [42]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)

In [43]:
y_preds_rf = rf.predict(X_train_tfidf)
y_preds_test_rf = rf.predict(X_test_tfidf)

In [44]:
print(classification_report(y_train, y_preds_rf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1511
           1       1.00      1.00      1.00      1539

    accuracy                           1.00      3050
   macro avg       1.00      1.00      1.00      3050
weighted avg       1.00      1.00      1.00      3050



In [45]:
print(classification_report(y_test, y_preds_test_rf))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       353
           1       0.91      0.92      0.91       410

    accuracy                           0.91       763
   macro avg       0.91      0.91      0.91       763
weighted avg       0.91      0.91      0.91       763



In [46]:
rf_params = {'n_estimators':[10, 50, 100],
             'criterion':['gini','entropy','log_loss'],
             'max_depth':[5,10,20]
            }

In [47]:
rf_grid = GridSearchCV(rf, param_grid=rf_params, cv=5)

In [48]:
rf_grid.fit(X_train_tfidf, y_train)

In [52]:
print(rf_grid.best_estimator_)
print(rf_grid.best_params_)

RandomForestClassifier(criterion='log_loss', max_depth=20)
{'criterion': 'log_loss', 'max_depth': 20, 'n_estimators': 100}


In [53]:
rf2 = RandomForestClassifier(criterion='log_loss', max_depth=20, n_estimators=100)
rf2.fit(X_train_tfidf, y_train)

In [54]:
y_preds_rf2 = rf2.predict(X_train_tfidf)
y_preds_test_rf2 = rf2.predict(X_test_tfidf)

In [55]:
print(classification_report(y_train, y_preds_rf2))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      1511
           1       0.87      0.91      0.89      1539

    accuracy                           0.89      3050
   macro avg       0.89      0.89      0.89      3050
weighted avg       0.89      0.89      0.89      3050



In [56]:
print(classification_report(y_test, y_preds_test_rf2))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84       353
           1       0.86      0.87      0.86       410

    accuracy                           0.85       763
   macro avg       0.85      0.85      0.85       763
weighted avg       0.85      0.85      0.85       763



### Doc2Vec Vectorization

In [60]:
# Tag tokenized_tweets with an index for identification
tagged_train_data = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_train_data)]

# Initialize and train a Doc2Vec vectorizer
start = time.time()
vectorizer = Doc2Vec(tagged_train_data, vector_size=50, window=2, min_count=1, workers=4, epochs=40)
end = time.time()
print(f'{end-start} seconds')

10.887912034988403 seconds


In [62]:
vectors = [vectorizer.dv[i] for i in range(len(tagged_train_data))]

AttributeError: 'Doc2Vec' object has no attribute 'dv'

In [65]:
print(gensim.__version__)

3.8.3


In [64]:
type(vectorizer)

gensim.models.doc2vec.Doc2Vec