# Project Outline

1. Import proper packages and open files
2. Clean the data
3. Create bag of words and lemmatize
4. Vectorize into one, two, and three word phrases
5. Implementing the Model

Then, implement the model on the judging data.

## Step One: Import proper packages and open files

In [1]:
# important packages to work with
import pandas as pd
import io
import numpy as np
import csv

# NLP packages to clean the data, lemmatize, and vectorize
import nltk                                                                     
import re
nltk.download('punkt')
from nltk.corpus import stopwords                                 # used to clean data
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer                   # used to lemmatize
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer       # used to create feature vectors

# sklearn packages for machine learning
from sklearn.model_selection import train_test_split              # used to create train-test split             
from sklearn.linear_model import LogisticRegression               # used to implement Logistic Regression
from sklearn.metrics import accuracy_score                        # used to get accuracy

# tensorflow packages for machine learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential                    # used to create the model
from tensorflow.keras.layers import Activation, Dense             # used to add layers to the neural net
from tensorflow.keras.optimizers import Adam                      # optimizer    
# used to prevent overfitting
from tensorflow.keras.callbacks import EarlyStopping 
from tensorflow.keras.layers import Dropout                       

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
train = pd.read_csv("/content/gdrive/My Drive/High School/Summer 2020/Ignition Hacks 2020/training_data.csv")
judging = pd.read_csv("/content/gdrive/My Drive/High School/Summer 2020/Ignition Hacks 2020/contestant_judgment.csv")

In [4]:
# Taking first look at the data
train.head()

Unnamed: 0,ID,User,Text,Sentiment
0,864192,Carly_FTS,I *heart* filling up @dennisschaub desk 1 it...,1
1,523691,Open_Sourcing,"#SocioMat - people create prettier, younger an...",1
2,584154,xxcharlx,no way i dont want the tour to end,0
3,1527961,andreapuddu,@HemalRadia Hi Amazing Brother! Sending Limitl...,1
4,28609,umbec,@flockmaster they are chocolate,1


In [5]:
# Information about the data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   ID         1000000 non-null  int64 
 1   User       1000000 non-null  object
 2   Text       1000000 non-null  object
 3   Sentiment  1000000 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 30.5+ MB


## Step Two: Clean the Data

In [6]:
# stop_words contains typical sentence stopping words from the english language
stop_words = set(stopwords.words('english'))

# contains all the filtered (good) words
filtered_sent = []

for index, row in train.iterrows():
    tokenized_words = []
    # Remove @s
    train.at[index, 'Text'] = re.sub(r"@[A-z]+", " ", train.at[index, 'Text'])
    
    # Remove punctuation
    train.at[index, 'Text'] = re.sub(r"\W", " ", train.at[index, 'Text'])

    # Remove numbers
    train.at[index, 'Text'] = re.sub(r"\d+", "", train.at[index, 'Text'])

    # Remove spaces
    train.at[index, 'Text'] = re.sub(r"\s+", " ", train.at[index, 'Text'])
   
    # Make everything lowercase
    train.at[index, 'Text'] = str(train.at[index, 'Text']).lower()
    
    # Tokenize
    tokenized_words = nltk.word_tokenize(train.at[index, 'Text'])
    
    # Appending all the filtered words to filtered_sent
    filtered_sent.append([])
    for w in tokenized_words:
        if w not in stop_words:
            filtered_sent[index].append(w)

## Step Three: Create Bag of Words and Lemmatize

In [7]:
# Getting lemmatizer from nltk library
lemmatizer = WordNetLemmatizer()

# looping through filtered_sent and lemmatizing each of the words
# lemmatizing means to sort words with similar stem or form
i = 0
for words in filtered_sent:
    newWords = []

    # Lemmatizing the words in each list of filtered words
    for word in words:
        newWords.append(lemmatizer.lemmatize(word, pos = 'v'))
     
    # putting the lemmatized words back in the filtered_send list
    filtered_sent[i] = " ".join(newWords)
    i += 1

In [8]:
# checking if lemmatization worked

print(filtered_sent[0])

heart fill desk mean sales amp desk


## Step Four: Vectorize into one and two-word phrases

In [9]:
# creating a new column in the dataframe with the lemmatized bag of words
new_column = pd.DataFrame({'BagofWords': filtered_sent})
train = train.merge(new_column, right_index = True, left_index=True)
train.head()

Unnamed: 0,ID,User,Text,Sentiment,BagofWords
0,864192,Carly_FTS,i heart filling up desk it means sales amp it ...,1,heart fill desk mean sales amp desk
1,523691,Open_Sourcing,sociomat people create prettier younger and b...,1,sociomat people create prettier younger better...
2,584154,xxcharlx,no way i dont want the tour to end,0,way dont want tour end
3,1527961,andreapuddu,hi amazing brother sending limitless love you...,1,hi amaze brother send limitless love way twitt...
4,28609,umbec,they are chocolate,1,chocolate


In [10]:
# creating original features, next step is vectorization
bagofwords = train.BagofWords.tolist()

# creating labels
sentiment = train.Sentiment.tolist()

In [11]:
# creating a CountVectorizer for 400 single-word phrases
cv = CountVectorizer(max_features=400)

# creating a CountVectorizer for 400 two or three-word phrases phrases
cv2 = CountVectorizer(max_features=400, ngram_range=(2, 3))

# concatenating the arrays that have formed small vectors to get a large feature vector
x = np.concatenate((cv.fit_transform(train['BagofWords']).toarray(), cv2.fit_transform(train['Text']).toarray()), axis=1)

# labels
y = train['Sentiment'].values

"\n\n# getting the feature names from the count vectorizer\nheader_1 = cv.get_feature_names()\nheader_2 = cv2.get_feature_names()\n\n# creating two output dataframes for both count vectorizers\noutput_1 = pd.DataFrame(x_1, columns = header_1)\noutput_2 = pd.DataFrame(x_2, columns = header_2)\n\n# merging the two dataframes into one\noutput_1 = output_1.merge(output_2, right_index=True, left_index=True)\n\n# adding the labels column to the dataframe\noutput_1['Sentiment'] = train['Sentiment']\n\noutput_1.head()\n\n"

# Step 5: Model

In [13]:
# splitting the data into train and test using sklearn
# train: 75%, test: 25%

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [14]:
# Implementing a basic ML model (logistic regression)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=700)

# fitting the classifier on X_train and y_train

classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=700,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# making predictions with the classifier on X_test
predictions = classifier.predict(X_test)

from sklearn.metrics import accuracy_score

# using sklearn to get the accuracy (since this is a classification task  )
accuracy_score(y_test, predictions)

0.737308

#### Creating an Artificial Neural Network with Tensorflow


In [16]:
# Splitting the train data into train and validation sets
# validation loss is constantly being checked to prevent overfitting
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

# Using model = Sequential() to add layers to our neural net
model = Sequential()

# Input layer: input shape is (800,) and activation function is relu
model.add(Dense(units=256, input_shape=X_train[0].shape, activation='relu'))

# hidden layers: no activation functions
model.add(Dense(units=128, activation=None))
model.add(Dense(units=64, activation=None))
model.add(Dense(units=48, activation=None))
model.add(Dense(units=32, activation=None))
model.add(Dense(units=16, activation=None))
model.add(Dense(units=12, activation=None))
model.add(Dense(units=8, activation=None))
model.add(Dense(units=4, activation=None))

# output layer: sigmoid activation function
model.add(Dense(units=1, activation='sigmoid'))

# compiling the model
# adam optimizer, binary crossentropy loss
# also will allow us to see the accuracy while the model is being trained
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# early_stopping monitors validation loss to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss')

# printing out the summary of the model with all the layers
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               205056    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 48)                3120      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1568      
_________________________________________________________________
dense_5 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_6 (Dense)              (None, 12)                2

In [17]:
# Training the model on the train data
# 10 epochs with a validation split of 0.25

history = model.fit(X_train, y_train, epochs=10, verbose=1, validation_split=0.25)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
preds = model.predict(X_test)

In [None]:
len(preds)

200000

In [19]:
predictions = []

for i in range(len(preds)):
  if preds[i] >= 0.5:
    predictions.append(1)
  else:
    predictions.append(0)

In [20]:
accuracy_score(y_test, predictions)

0.724508

# Make Predictions on the Judging Dataframe


#### Doing the same pre-processing

In [24]:
# stop_words contains typical sentence stopping words from the english language
stop_words = set(stopwords.words('english'))

# contains all the filtered (good) words
filtered_sent = []

for index, row in judging.iterrows():
    tokenized_words = []
    # Remove @s
    judging.at[index, 'Text'] = re.sub(r"@[A-z]+", " ", judging.at[index, 'Text'])
    
    # Remove punctuation
    judging.at[index, 'Text'] = re.sub(r"\W", " ", judging.at[index, 'Text'])

    # Remove numbers
    judging.at[index, 'Text'] = re.sub(r"\d+", "", judging.at[index, 'Text'])

    # Remove spaces
    judging.at[index, 'Text'] = re.sub(r"\s+", " ", judging.at[index, 'Text'])
   
    # Make everything lowercase
    judging.at[index, 'Text'] = str(judging.at[index, 'Text']).lower()
    
    # Tokenize
    tokenized_words = nltk.word_tokenize(judging.at[index, 'Text'])
    
    # Appending all the filtered words to filtered_sent
    filtered_sent.append([])
    for w in tokenized_words:
        if w not in stop_words:
            filtered_sent[index].append(w)

#### Doing the same lemmatization

In [25]:
# Getting lemmatizer from nltk library
lemmatizer = WordNetLemmatizer()

# looping through filtered_sent and lemmatizing each of the words
# lemmatizing means to sort words with similar stem or form
i = 0
for words in filtered_sent:
    newWords = []

    # Lemmatizing the words in each list of filtered words
    for word in words:
        newWords.append(lemmatizer.lemmatize(word, pos = 'v'))
     
    # putting the lemmatized words back in the filtered_send list
    filtered_sent[i] = " ".join(newWords)
    i += 1

In [26]:
judging['BagofWords'] = filtered_sent

#### Vectorization

In [42]:
# creating original features, next step is vectorization
bagofwords = judging.BagofWords.tolist()

In [29]:
# creating a CountVectorizer for 400 single-word phrases
cv = CountVectorizer(max_features=400)

# creating a CountVectorizer for 400 two or three-word phrases phrases
cv2 = CountVectorizer(max_features=400, ngram_range=(2, 3))

# concatenating the arrays that have formed small vectors to get a large feature vector
x = np.concatenate((cv.fit_transform(judging['BagofWords']).toarray(), cv2.fit_transform(judging['Text']).toarray()), axis=1)

#### Making the predictions

In [35]:
test_predictions = classifier.predict(x)

for i in range(len(test_predictions)):
  if test_predictions[i] >= 0.5:
    test_predictions[i] = 1
  else:
    test_predictions[i] = 0

In [36]:
test_nn_predictions = model.predict(x)

for i in range(len(test_nn_predictions)):
  if test_nn_predictions[i] >= 0.5:
    test_nn_predictions[i] = 1
  else:
    test_nn_predictions[i] = 0

#### Put the predictions into the final


In [40]:
judging['Sentiment'] = test_predictions

In [41]:
judging.to_csv('judging.csv')
!cp judging.csv "gdrive/My Drive/High School/Summer 2020/Ignition Hacks 2020/judging.csv"