### Setting Up for Success


In [4]:
##############################################
# Importing Modules / Loading Libraries
##############################################
# To make use of the functions in a module, we'll need to bring in the module
# with an import statement in our current program as a separate namespace. 
# Later on, we can refer to the function in dot notation -  [module].[function].

# general
import io
import sys
import re
import timeit
import string

# for data analysis and manipulation
import pandas as pd 
import numpy as np 

# for sentiment analysis
import nltk
from nltk import PorterStemmer # popular stemming algorithm
from sklearn.feature_extraction.text import CountVectorizer #implements both tokenization and occurrence counting in a single class
from sklearn.model_selection import train_test_split # for spliting training and validation dataset
from sklearn.metrics import f1_score # used to assess machine learning model performance

# for cleaner output
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [5]:
########################################
# upload training data file for review
########################################

### use following for google colab to upload training data
from google.colab import files 
uploaded = files.upload()

Saving training_data.csv to training_data (3).csv


In [6]:
### use locally to read training data
# train = pd.read_csv(r'training_data.csv')

### use following for google colab to read training data
train = pd.read_csv(io.BytesIO(uploaded['training_data.csv'])) 
print(train)

# create copy to not lose original
train_original=train.copy()

########################################
# Data Set Description
########################################
#
# ID: The ID associated with the tweets in the given dataset
# User: Username associated with the tweet 
# Text: The body of the tweet having either positive or negative sentiments associated with it
# Sentiment: A tweet with Sentiment '0' is negative while a tweet with Sentiment '1' is positive 

             ID  ... Sentiment
0        864192  ...         1
1        523691  ...         1
2        584154  ...         0
3       1527961  ...         1
4         28609  ...         1
...         ...  ...       ...
999995  1366175  ...         1
999996   681828  ...         1
999997   488988  ...         1
999998   985613  ...         0
999999  1410994  ...         0

[1000000 rows x 4 columns]


In [7]:
########################################
# upload test data file for review
########################################

### use following for google colab to upload test data
# upload contestant_judgment.csv data file for review
from google.colab import files 
uploaded = files.upload()

Saving contestant_judgment.csv to contestant_judgment (1).csv


In [8]:
### use locally to read test data
# test = pd.read_csv(r'contestant_judgment.csv')

### use following for google colab to read test data
test = pd.read_csv(io.BytesIO(uploaded['contestant_judgment.csv'])) 
print(test)

# create copy to not lose original
test_original=test.copy()

########################################
# Data Set Description
########################################
#
# ID: The ID associated with the tweets in the given dataset
# User: Username associated with the tweet 
# Text: The body of the tweet having either positive or negative sentiments associated with it

             ID  ...                                               Text
0        599303  ...  I'm on my way to miss kacy's 4th bday party at...
1        359673  ...  @ripestapple  I might not be the right person ...
2        391095  ...                           @zomgkris I know it is. 
3        820049  ...  Mii ViSioN is BLuRRy...iM goiN to Bed!!NiTe Ni...
4        658429  ...  @tealou anyways - i did something good for som...
...         ...  ...                                                ...
599994  1664039  ...                          is waiting to go to work 
599995   140294  ...  For 4 years i've been only an hour from syracu...
599996  1466481  ...  @SBWinner oh I'm sry for u  two years ago we h...
599997   148005  ...  @amy_beee he can`t think of a reason how you l...
599998   606477  ...                    Going holiday shopping! Yipee! 

[599999 rows x 3 columns]


### Data Preprocessing
Data preprocessing helps create better models and predictions. There are four stages: data cleaning, data integration, data reduction, and data transformation (normalization, aggregation, generalization).

In [9]:
#############################################
# Step 1 - combine data into one data frame
#############################################
# Combine the train.csv and test.csv files to run scripts over one dataframe

combine = train.append(test,ignore_index=True)

In [10]:
# show how contestant_judgment.csv is appended to the training_data.csv file
# columns not in the original dtaframes are added as new columns and the
# new cells are populated with NaN value (see Sentiment column for rows from test.csv

# view first few rows of combined dataset
combine.head()

Unnamed: 0,ID,User,Text,Sentiment
0,864192,Carly_FTS,I *heart* filling up @dennisschaub desk 1 it...,1.0
1,523691,Open_Sourcing,"#SocioMat - people create prettier, younger an...",1.0
2,584154,xxcharlx,no way i dont want the tour to end,0.0
3,1527961,andreapuddu,@HemalRadia Hi Amazing Brother! Sending Limitl...,1.0
4,28609,umbec,@flockmaster they are chocolate,1.0


In [11]:
# view last few rows of combined dataset
combine.tail()

Unnamed: 0,ID,User,Text,Sentiment
1599994,1664039,airsoft_gunner,is waiting to go to work,
1599995,140294,mlynnfrank,For 4 years i've been only an hour from syracu...,
1599996,1466481,xxAnnaSxx,@SBWinner oh I'm sry for u two years ago we h...,
1599997,148005,Kiddylicious,@amy_beee he can`t think of a reason how you l...,
1599998,606477,SpangleyGarbage,Going holiday shopping! Yipee!,


In [12]:
#############################################
# Step 2 - remove twitter handles (@user)
#############################################

# define function that takes two arguments to:
# >>> remove unwated text patterns
# >>> return same input string but without given pattern
# first argument text is the original text string
# second argument is pattern of text

def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

# remove twitter handles and place new Text into Tidy_Tweets column
# NumPy vectorization is used for speed 
combine['Tidy_Tweets'] = np.vectorize(remove_pattern)(combine['Text'], "@[\w]*")

In [13]:
########################################
# Data Set Description
########################################
#
# ID: The ID associated with the tweets in the given dataset
# Text: The body of the tweet having either positive or negative sentiments associated with it
# Sentiment: A tweet with Sentiment '0' is negative while a tweet with Sentiment '1' is positive 
# Tidy_Tweets: A tweet that has been cleaned and 'tidied up' so to speak 

# show current combined dataset 
combine.head()

Unnamed: 0,ID,User,Text,Sentiment,Tidy_Tweets
0,864192,Carly_FTS,I *heart* filling up @dennisschaub desk 1 it...,1.0,I *heart* filling up desk 1 it means sales ...
1,523691,Open_Sourcing,"#SocioMat - people create prettier, younger an...",1.0,"#SocioMat - people create prettier, younger an..."
2,584154,xxcharlx,no way i dont want the tour to end,0.0,no way i dont want the tour to end
3,1527961,andreapuddu,@HemalRadia Hi Amazing Brother! Sending Limitl...,1.0,Hi Amazing Brother! Sending Limitless (((Love...
4,28609,umbec,@flockmaster they are chocolate,1.0,they are chocolate


In [14]:
################################################################
# Step 3 - remove punctuation, numbers, and special characters
################################################################
# take out miscellaneous text characters that don't help us analyze sentiment

# replace punctuation, numbers,and special characters with space
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].str.replace("[^a-zA-Z#]", " ")

# show current combined dataset with 10 rows
combine.head(10)

Unnamed: 0,ID,User,Text,Sentiment,Tidy_Tweets
0,864192,Carly_FTS,I *heart* filling up @dennisschaub desk 1 it...,1.0,I heart filling up desk it means sales ...
1,523691,Open_Sourcing,"#SocioMat - people create prettier, younger an...",1.0,#SocioMat people create prettier younger an...
2,584154,xxcharlx,no way i dont want the tour to end,0.0,no way i dont want the tour to end
3,1527961,andreapuddu,@HemalRadia Hi Amazing Brother! Sending Limitl...,1.0,Hi Amazing Brother Sending Limitless Love...
4,28609,umbec,@flockmaster they are chocolate,1.0,they are chocolate
5,1284501,rt_nuggets,@cHuMeee Just got home! And from what I've hea...,1.0,Just got home And from what I ve heard MY LA...
6,971375,Sweetcakes16,@chrisettefan yup yup especially when its i.t,0.0,yup yup especially when its i t
7,1597758,alanakf,i have a new love: matt lanter of 90210. absol...,1.0,i have a new love matt lanter of absol...
8,14016,MLM216,Graham's car looked really &quot;snakey&quot; ...,0.0,Graham s car looked really quot snakey quot ...
9,1408251,Tickle_EAC,Why some people make me be evil towards them. ...,0.0,Why some people make me be evil towards them ...


In [15]:
################################################################
# Step 4 - remove stop words
################################################################
# take out short words that generally don't have much meaning like "and" or "oh"

# remove stop words
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# show current combined dataset with 10 rows
combine.head(10)

Unnamed: 0,ID,User,Text,Sentiment,Tidy_Tweets
0,864192,Carly_FTS,I *heart* filling up @dennisschaub desk 1 it...,1.0,heart filling desk means sales desk
1,523691,Open_Sourcing,"#SocioMat - people create prettier, younger an...",1.0,#SocioMat people create prettier younger bette...
2,584154,xxcharlx,no way i dont want the tour to end,0.0,dont want tour
3,1527961,andreapuddu,@HemalRadia Hi Amazing Brother! Sending Limitl...,1.0,Amazing Brother Sending Limitless Love Your Tw...
4,28609,umbec,@flockmaster they are chocolate,1.0,they chocolate
5,1284501,rt_nuggets,@cHuMeee Just got home! And from what I've hea...,1.0,Just home from what heard LAKERS BEAT NUGGETS ...
6,971375,Sweetcakes16,@chrisettefan yup yup especially when its i.t,0.0,especially when
7,1597758,alanakf,i have a new love: matt lanter of 90210. absol...,1.0,have love matt lanter absolutely adore swoon s...
8,14016,MLM216,Graham's car looked really &quot;snakey&quot; ...,0.0,Graham looked really quot snakey quot replays
9,1408251,Tickle_EAC,Why some people make me be evil towards them. ...,0.0,some people make evil towards them hate that


In [16]:
################################################################
# Step 5 - tokenize text to set up for later
################################################################
# tokenize cleaned tweets to perform techniques like stemming in next step
# this breaks up strings into a list of words

# tokenize using lambda function (which can take in any number of arguments and
# returns value of a single expression)
tokenized_tweet = combine['Tidy_Tweets'].apply(lambda x: x.split())

# show a sample of tokenized tweets
tokenized_tweet.head()

0           [heart, filling, desk, means, sales, desk]
1    [#SocioMat, people, create, prettier, younger,...
2                                   [dont, want, tour]
3    [Amazing, Brother, Sending, Limitless, Love, Y...
4                                    [they, chocolate]
Name: Tidy_Tweets, dtype: object

In [None]:
################################################################
# Step 6 - apply stemming technique
################################################################
# stemming is a rule-based process of removing suffixes like "ing", "ly", and "es"
# for example, "going" and "goes" are different variations of the word "go"

# define stemmer
ps = PorterStemmer()

# stem tokenized tweet
tokenized_tweet = tokenized_tweet.apply(lambda x: [ps.stem(i) for i in x])

# show a sample of tokenized tweets with stemming algorithm applied
tokenized_tweet.head()

In [None]:
# stitch tokens back together by joining
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

combine['Tidy_Tweets'] = tokenized_tweet

# show a sample of tokenized tweets
combine.head()

In [None]:
################################################################
# Step 7 - understand impact of hashtags on tweet sentiment
################################################################
# define a function that collects hashtags

# collect hashtags
def Hashtags_Extract(x):
    hashtags=[]
    
    # Loop over the words in the tweet
    for i in x:
        ht = re.findall(r'#(\w+)',i)
        hashtags.append(ht)
    
    return hashtags

In [None]:
# nested list of hashtags from positive sentiments
ht_positive = Hashtags_Extract(combine['Tidy_Tweets'][combine['Sentiment']==1])

# display nested list of hashtags from positive sentiments
ht_positive

In [None]:
# unnest list of hashtags from positive sentiments
ht_positive_unnest = sum(ht_positive,[])

In [None]:
# nested list of hashtags from negative sentiments
ht_negative = Hashtags_Extract(combine['Tidy_Tweets'][combine['Sentiment']==0])

# display nested list of hashtags from negative sentiments
ht_negative

In [None]:
#unnest list of hashtags from negative sentiments
ht_negative_unnest = sum(ht_negative,[])

In [None]:
# counting frequency of words with positive sentiment
word_freq_positive = nltk.FreqDist(ht_positive_unnest)

# display frequency of words with positive sentiment
word_freq_positive

In [None]:
# create dataframe for most frequently used words in hashtags for positive sentiments
df_positive = pd.DataFrame({'Hashtags':list(word_freq_positive.keys()),'Count':list(word_freq_positive.values())})

# display dataframe for used words in hashtags for positive sentiments
df_positive.head(10)

Unnamed: 0,Hashtags,Count
0,sociomat,2
1,peterfacinelli,3
2,hoppusday,46
3,elevensestim,21
4,bush,1
5,basstip,3
6,karnevel,1
7,phpkonferenca,4
8,pink,1
9,twtrcon,12


In [None]:
# counting frequency of words with negative sentiment
word_freq_negative = nltk.FreqDist(ht_negative_unnest)

# display frequency of words with negative sentiment
word_freq_negative

FreqDist({'fail': 203,
          'applefail': 4,
          'attfail': 3,
          'premidlifecrisi': 1,
          'susanboyl': 3,
          'twune': 1,
          'tenni': 9,
          'monday': 3,
          'trackl': 98,
          'canuck': 10,
          'asylm': 25,
          'haloodst': 1,
          'ussocc': 2,
          'imu': 7,
          'whackwednesday': 1,
          'petewentzday': 12,
          'visialvoicemail': 1,
          'iranelect': 288,
          'splogin': 4,
          'squarespac': 426,
          'squrespac': 1,
          'wmata': 3,
          'neda': 16,
          'home': 2,
          'spymast': 44,
          'followfriday': 101,
          'phenom': 1,
          'hero': 4,
          'ineedanewmac': 1,
          'masterchef': 25,
          'asot': 91,
          'sux': 1,
          'fear': 2,
          'stock': 1,
          'vietnam': 1,
          'jona': 2,
          'gymtim': 1,
          'workingtim': 1,
          'pakcricket': 51,
          'theonlinemom': 6,
    

In [None]:
# create dataframe for most frequently used words in hashtags for negative sentiments
df_negative = pd.DataFrame({'Hashtags':list(word_freq_negative.keys()),'Count':list(word_freq_negative.values())})

# display dataframe for used words in hashtags for negative sentiments
df_negative.head(10)

Unnamed: 0,Hashtags,Count
0,fail,203
1,applefail,4
2,attfail,3
3,premidlifecrisi,1
4,susanboyl,3
5,twune,1
6,tenni,9
7,monday,3
8,trackl,98
9,canuck,10


### Extracting Features from Tidy Tweets

In [None]:
################################################################
# Bag of Words
################################################################
# Bag of Words is a method to extract features from text documents for
# training machine learning algorithms by creating a vocabulary of all unique
# words occurring in all documents in the training set
#
# N.B. number of occurrence and not sequence or order of words matters in this approach

# apply word embedding
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=100, stop_words='english')


In [None]:
# display how Tidy Tweet column currently looks like
combine['Tidy_Tweets']

0                             heart fill desk mean sale desk
1          #sociomat peopl creat prettier younger better ...
2                                             dont want tour
3          amaz brother send limitless love your twitterl...
4                                                they chocol
                                 ...                        
1599994                                            wait work
1599995    year been onli hour from syracus closer exhibi...
1599996           year wintercamp ski long summercamp better
1599997              think reason learn about that fact find
1599998                                 go holiday shop yipe
Name: Tidy_Tweets, Length: 1599999, dtype: object

In [None]:
### bag-of-words feature matrix
# Think of each row as a tweet with columns headed by the different words in the tweets

bow = bow_vectorizer.fit_transform(combine['Tidy_Tweets'])

df_bow = pd.DataFrame(bow.todense())

df_bow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1599995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1599996,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1599997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
################################################################
# split dataset into training and validation
################################################################
# Split dataset into training and validation so that we can train and test model
# before applying it to predict for unseen and unlabeled test data

# use features from Bag-of-Words for training set
train_bow = bow[:1000000]
train_bow.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Bag of Words features
x_train_bow, x_valid_bow, y_train_bow, y_valid_bow = train_test_split(train_bow,train['Sentiment'],test_size=0.3,random_state=2)

### Apply Machine Learning Model

In [None]:
################################################################
# logistic regression model
################################################################

# fit logistic regression model
from sklearn.linear_model import LogisticRegression
Log_Reg = LogisticRegression(random_state=0,solver='lbfgs')

In [None]:
# bag of words features
Log_Reg.fit(x_train_bow,y_train_bow)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# predict the probabilities for a tweet falling into either positive or negative class
prediction_bow = Log_Reg.predict_proba(x_valid_bow)

prediction_bow

array([[0.57173883, 0.42826117],
       [0.82795164, 0.17204836],
       [0.48874096, 0.51125904],
       ...,
       [0.48874096, 0.51125904],
       [0.64459655, 0.35540345],
       [0.48172733, 0.51827267]])

In [None]:
### calculate F1 score
# if prediction is greater than or equal to 0.3 then 1 else 0
# Where 0 is for negative sentiment tweets and 1 for positive sentiment tweets
prediction_int = prediction_bow[:,1]>=0.3

# converting the results to integer type
prediction_int = prediction_int.astype(np.int)
prediction_int

# calculating f1 score
log_bow = f1_score(y_valid_bow, prediction_int)

log_bow

0.7057750592797479

In [None]:
################################################################
# term frequency-inverse document frequency
################################################################
# aka TF-IDF
# TF-IDF is a weight used in information retrieval as a statistical measure
# to evaluate how important a word is to a document in a corpus (body of text
# being considered). Importance increases proportionally to the number of times
# a word appears in the document but is offset by the frequency of the word
# in the corpus.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_df=0.90, min_df=2,max_features=500,stop_words='english')

tfidf_matrix=tfidf.fit_transform(combine['Tidy_Tweets'])

df_tfidf = pd.DataFrame(tfidf_matrix.todense())

df_tfidf


NameError: ignored

In [None]:
train_tfidf_matrix = tfidf_matrix[:1000000]
train_tfidf_matrix.todense()

In [None]:
# split dataset into train and validation set
x_train_tfidf, x_valid_tfidf, y_train_tfidf, y_valid_tfidf = train_test_split(train_tfidf_matrix,train['label'],test_size=0.3,random_state=17)

In [None]:
################################################################
# XGBoost model
################################################################

from xgboost import XGBClassifier

# bag of words features
model_bow = XGBClassifier(random_state=22,learning_rate=0.9)

In [None]:
# fitting the XGBoost Model
model_bow.fit(x_train_bow, y_train_bow)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.9, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=22,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# predict the probability of a tweet falling into either positive or negative class
xgb = model_bow.predict_proba(x_valid_bow)
xgb

array([[0.50283307, 0.49716693],
       [0.7185204 , 0.28147963],
       [0.48625076, 0.51374924],
       ...,
       [0.48625076, 0.51374924],
       [0.68031967, 0.31968033],
       [0.47792143, 0.5220786 ]], dtype=float32)

In [None]:
### calculate F1 score
# if prediction is greater than or equal to 0.3 then 1 else 0
# Where 0 is for negative sentiment tweets and 1 for positive sentiment tweets
xgb=xgb[:,1]>=0.3

# converting the results to integer type
xgb_int=xgb.astype(np.int)
prediction_int

# calculating f1 score
xgb_bow = f1_score(y_valid_bow, xgb_int)

xgb_bow

IndexError: ignored

### Predict Results for Test Data

In [None]:
test_tfidf = tfidf_matrix[1000000:]
test_pred = Log_Reg.predict_proba(test_tfidf)

test_pred_int = test_pred[:,1] >= 0.3
test_pred_int = test_pred_int.astype(np.int)

test['Sentiment'] = test_pred_int

submission = test[['ID','Sentiment']]
submission.to_csv('result.csv', index=False)