In [46]:
import Cython
from Cython.Compiler import Options
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Cython.Compiler.Options.get_directive_defaults()['linetrace'] = True
Cython.Compiler.Options.get_directive_defaults()['binding'] = True

%reload_ext Cython

In [56]:
%%cython
cimport numpy as np
import pandas as pd
from textblob import TextBlob
import sys
import tweepy
import matplotlib.pyplot as plt
import numpy as np
import os
import nltk
import pycountry
import re
import string
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langdetect import detect
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from datetime import timedelta
import datetime
import pytz
import time
nltk.download('vader_lexicon')
from keys import *

start_time = time.time()

client = tweepy.Client(bearer_token=bearer_token,\
                       consumer_key = consumer_key,\
                       consumer_secret = consumer_secret,\
                       access_token = access_token,\
                       access_token_secret = access_token_secret)

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

df_data = pd.read_csv("finalNbaData.csv").drop('Unnamed: 0', axis=1)
df_data.dateTime = pd.to_datetime(df_data.dateTime)
utc=pytz.UTC
weekPriorDate = datetime.datetime.now() - timedelta(days=6)
weekPriorDate = utc.localize(weekPriorDate) 

df_data.dateTime = df_data.dateTime.apply(lambda x: x.replace(tzinfo=None))
study_games = df_data[df_data.dateTime > weekPriorDate.replace(tzinfo=None)].__id__.unique()

def percentage(double part, double whole):
    cdef double prod = 100 * float(part)/float(whole)
    return prod 

def clean_tweets(tweet_list):
    tweet_list = [re.sub('RT @\w+: '," ",x) for x in tweet_list]
    tweet_list = [re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x) for x in tweet_list]
    tweet_list = [x.lower() for x in tweet_list]
    return tweet_list


def run(game_id):
    df = df_data[df_data.__id__ == game_id]
    home_team = df['homeTeam'].values[0]
    away_team = df['awayTeam'].values[0]

    game_datetime = pd.to_datetime(df['dateTime'].values[0])
    game_datetime_end = game_datetime
    game_datetime_start = game_datetime - timedelta(hours=5, minutes=0)
    
    
    print (game_id, home_team, away_team)
    
    # home tweets 
    home_tweet_list = []
    cdef int home_negative = 0
    cdef int home_positive = 0
    cdef int home_neutral = 0
    
    for tweet in tweepy.Paginator(client.search_recent_tweets, query=home_team,
                              tweet_fields=['lang', 'created_at'], max_results=100, start_time=game_datetime_start, end_time=game_datetime_end).flatten(limit=search_limit):

        if tweet.lang == 'en':
            home_tweet_list.append(tweet.text)
     
    cdef int home_tweet_count = len(set(home_tweet_list))
    
    for tweet in clean_tweets(list(set(home_tweet_list))):
        score = SentimentIntensityAnalyzer().polarity_scores(tweet)
        neg = score['neg']
        neu = score['neu']
        pos = score['pos']

        if neg > pos:
            home_negative += 1
        elif pos > neg:
            home_positive += 1
        elif pos == neg:
            home_neutral += 1
    
    # away tweets
    away_tweet_list = []
    cdef int away_negative = 0
    cdef int away_positive = 0
    cdef int away_neutral  = 0
    
    for tweet in tweepy.Paginator(client.search_recent_tweets, query=away_team,
                              tweet_fields=['lang', 'created_at'], max_results=100, start_time=game_datetime_start, end_time=game_datetime_end).flatten(limit=search_limit):

        if tweet.lang == 'en':
            away_tweet_list.append(tweet.text)
    
    cdef int away_tweet_count = len(set(away_tweet_list))
    
    for tweet in clean_tweets(list(set(away_tweet_list))):
        score = SentimentIntensityAnalyzer().polarity_scores(tweet)
        neg = score['neg']
        neu = score['neu']
        pos = score['pos']

        if neg > pos:
            away_negative += 1
        elif pos > neg:
            away_positive += 1
        elif pos == neg:
            away_neutral += 1
    
    return pd.DataFrame([[game_id, home_team, away_team, percentage(home_positive, home_tweet_count), percentage(home_negative, home_tweet_count), percentage(home_neutral, home_tweet_count),
           percentage(away_positive, away_tweet_count), percentage(away_negative, away_tweet_count), percentage(away_neutral, away_tweet_count), away_tweet_count + home_tweet_count]], 
           columns=['gameID', 'homeTeam', 'awayTeam','homePositive', 'homeNegative', 'homeNeutral', 'awayPositive', 'awayNegative', 'awayNeutral', 'total_tweets_analyzed'])




sentiment_results = []
cdef int search_limit = 500
for s in study_games:
    sentiment_results.append(run(s))
end_time = time.time()
print(end_time - start_time, 'seconds')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/farrisatif/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


BOSMIL512022 Boston Celtics Milwaukee Bucks
BOSMIL532022 Boston Celtics Milwaukee Bucks
MEMGS512022 Memphis Grizzlies Golden State Warriors
MEMGS532022 Memphis Grizzlies Golden State Warriors
MIAPHI522022 Miami Heat Philadelphia 76ers
MIAPHI542022 Miami Heat Philadelphia 76ers
PHODAL522022 Phoenix Suns Dallas Mavericks
PHODAL542022 Phoenix Suns Dallas Mavericks
70.7258129119873 seconds


In [61]:
# sentiment_results = pd.concat(sentiment_results)
sentiment_results

Unnamed: 0,gameID,homeTeam,awayTeam,homePositive,homeNegative,homeNeutral,awayPositive,awayNegative,awayNeutral,total_tweets_analyzed
0,BOSMIL512022,Boston Celtics,Milwaukee Bucks,38.565022,6.278027,55.156951,37.198068,2.898551,59.903382,430
0,BOSMIL532022,Boston Celtics,Milwaukee Bucks,54.166667,5.833333,40.0,55.421687,3.212851,41.365462,489
0,MEMGS512022,Memphis Grizzlies,Golden State Warriors,48.214286,3.571429,48.214286,53.080569,1.895735,45.023697,435
0,MEMGS532022,Memphis Grizzlies,Golden State Warriors,36.842105,11.004785,52.15311,46.031746,8.730159,45.238095,461
0,MIAPHI522022,Miami Heat,Philadelphia 76ers,54.393305,7.949791,37.656904,50.490196,6.862745,42.647059,443
0,MIAPHI542022,Miami Heat,Philadelphia 76ers,39.483395,13.653137,46.863469,32.272727,15.0,52.727273,491
0,PHODAL522022,Phoenix Suns,Dallas Mavericks,42.748092,11.068702,46.183206,56.569343,4.744526,38.686131,536
0,PHODAL542022,Phoenix Suns,Dallas Mavericks,49.765258,9.859155,40.375587,47.027027,7.567568,45.405405,398


In [142]:
df = copy.deepcopy(df_data)
ss = copy.deepcopy(sentiment_results)
df2 = copy.deepcopy(df)

df = df.drop(['sport', 'awayDiff', 'awayOutcome', 'gameState', 'homeDiff', 'sbId', 'score', 'total', '__id__', 'dateTime', 'under'], 1)
df = df.rename(columns={'id': 'gameID'})
df = df[['gameID', 'homeTeam', 'awayTeam', 'awayOdds', 'awaySpread', 'awaySpreadOdds', 'homeOdds', 'homeSpread', 'homeSpreadOdds', 'over', 'overOdds', 'underOdds', 'homeOutcome']]
df['y'] = np.where(df.homeOutcome == 'W', 1, 0)
df = df.drop('homeOutcome', 1)
df = pd.merge(df, ss, on=['gameID', 'homeTeam', 'awayTeam'], how='left')
df = df.drop('total_tweets_analyzed', 1)
df = df.fillna(0)
df.awayOdds = np.where(df.awayOdds < 0, 1 - (100 / df.awayOdds), 1 + (df.awayOdds / 100))
df.homeOdds = np.where(df.homeOdds < 0, 1 - (100 / df.homeOdds), 1 + (df.homeOdds / 100))
df.overOdds = np.where(df.overOdds < 0, 1 - (100 / df.overOdds), 1 + (df.overOdds / 100))
df.underOdds = np.where(df.underOdds < 0, 1 - (100 / df.underOdds), 1 + (df.underOdds / 100))
X = df.iloc[:, 3:].drop('y', 1)
y = df.y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8095238095238095

In [143]:
df = copy.deepcopy(df_data)
ss = copy.deepcopy(sentiment_results)

df = df.drop(['sport', 'awayDiff', 'awayOutcome', 'gameState', 'homeDiff', 'sbId', 'score', 'total', '__id__', 'dateTime', 'under'], 1)
df = df.rename(columns={'id': 'gameID'})
df = df[['gameID', 'homeTeam', 'awayTeam', 'awayOdds', 'awaySpread', 'awaySpreadOdds', 'homeOdds', 'homeSpread', 'homeSpreadOdds', 'over', 'overOdds', 'underOdds', 'homeOutcome']]
df['y'] = np.where(df.homeOutcome == 'W', 1, 0)
df = df.drop('homeOutcome', 1)
df = pd.merge(df, ss, on=['gameID', 'homeTeam', 'awayTeam'], how='left')
df = df.drop('total_tweets_analyzed', 1)
df = df.fillna(0)
df = df.drop(['homePositive', 'homeNegative',
       'homeNeutral', 'awayPositive', 'awayNegative', 'awayNeutral'], axis=1)
df.awayOdds = np.where(df.awayOdds < 0, 1 - (100 / df.awayOdds), 1 + (df.awayOdds / 100))
df.homeOdds = np.where(df.homeOdds < 0, 1 - (100 / df.homeOdds), 1 + (df.homeOdds / 100))
df.overOdds = np.where(df.overOdds < 0, 1 - (100 / df.overOdds), 1 + (df.overOdds / 100))
df.underOdds = np.where(df.underOdds < 0, 1 - (100 / df.underOdds), 1 + (df.underOdds / 100))
X = df.iloc[:, 3:].drop('y', 1)
y = df.y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)
model = LogisticRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7142857142857143