In [None]:
import pyopencl as cl #ONLY RUN THIS IF YOU'RE USING GPU

In [None]:
import nltk
# Fetch the NLTK data packages this notebook relies on:
#   punkt     - tokenizer models backing word_tokenize / sent_tokenize
#   stopwords - word lists read through stopwords.words('english')
#   wordnet   - lexical database consulted by WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# Read the two raw thread/comment exports; the first path becomes pre_df, the second post_df.
pre_df, post_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/pre_soccer_replaced.csv', './data/post_soccer_replaced.csv')
)

In [None]:
# Drop rows that have no thread title. Each frame is filtered with a mask
# computed from that same frame: a mask taken from pre_df does not share
# post_df's index, so it cannot select post_df's untitled rows.
# .copy() makes the results independent frames, so columns can be added to
# them later without pandas warning about writing to a slice.
pre_df = pre_df[pre_df['ptitle'].notna()].copy()
post_df = post_df[post_df['ptitle'].notna()].copy()

In [None]:
def TokenizeProcess(df):
    """Return a copy of ``df`` whose ``comment`` column holds cleaned tokens.

    Each comment is lower-cased, tokenized with ``word_tokenize``, stripped of
    punctuation, reduced to purely alphabetic tokens, filtered against the
    English stop-word list and finally lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have exactly eight columns, in this positional order:
        ptitle, pscore, pid, pbody, pcreated, comment, cauthor, ccreated.
        Rows are unpacked by position, not by column name.

    Returns
    -------
    pandas.DataFrame
        A new frame with those eight column names. ``comment`` is a list of
        lemmatized tokens per row; the other values are passed through
        unchanged. ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If a row of ``df`` does not contain exactly eight values.
    """
    output_columns = ['ptitle', 'pscore', 'pid', 'pbody', 'pcreated', 'comment', 'cauthor', 'ccreated']

    # Loop-invariant helpers: build them once instead of once per row.
    word_lemmatizer = WordNetLemmatizer()
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    records = []
    for (ptitle, pscore, pid, pbody, pcreated, comment, cauthor, ccreated) in df.values.tolist():
        # A blank CSV cell arrives as float NaN (or None); str() would turn it
        # into the literal word "nan"/"none", which would then survive every
        # filter below and pollute the token list.
        comment_missing = comment is None or (isinstance(comment, float) and comment != comment)
        if comment_missing:
            words_lm = []
        else:
            # Tokenize
            tokens = word_tokenize(str(comment).lower())

            # Strip punctuation
            tokens_strp = [w.translate(punctuation_table) for w in tokens]

            # Keep alphabetic tokens that are not stop words
            words = [w for w in tokens_strp if w.isalpha() and w not in stop_words]

            # Lemmatize
            # Note: We might need to do this for the brands we want to detect as well
            words_lm = [word_lemmatizer.lemmatize(w) for w in words]

        records.append({'ptitle': ptitle, 'pscore': pscore, 'pid': pid, 'pbody': pbody,
                        'pcreated': pcreated, 'comment': words_lm, 'cauthor': cauthor,
                        'ccreated': ccreated})

    # Build the frame once from the collected records. DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    return pd.DataFrame(records, columns=output_columns)

In [None]:
# Run the comment-cleaning pipeline over pre_df; every row's `comment` becomes a list of lemmatized tokens.
pre_df_tk = TokenizeProcess(pre_df)

In [None]:
from datetime import datetime
# From here on pre_df refers to the tokenized frame. This binds the same
# object rather than copying it, so columns added to pre_df below also show
# up on pre_df_tk.
pre_df = pre_df_tk

# Convert the numeric creation timestamps into datetime objects
# (datetime.fromtimestamp interprets them in the machine's local timezone).
pre_df['pcreated_date'] = list(map(datetime.fromtimestamp, pre_df['pcreated']))
post_df['pcreated_date'] = list(map(datetime.fromtimestamp, post_df['pcreated']))
pre_df['ccreated_date'] = list(map(datetime.fromtimestamp, pre_df['ccreated']))
#post_df['ccreated']=[datetime.fromtimestamp(x) for x in post_df['ccreated']]

# 1. I have fix dates 
# 2. Separate the teams out from title, pickle the files
# 3. Anu -> sentiment analysis
# 4. Sid -> Report starting
# 5. Bog -> front-end, matching


# Team lookup table read by getinvolved() as a module-level global. That function
# unpacks every row as (teamname, teamfull), so the CSV must have exactly two columns.
# Presumably a short name followed by the full club name - confirm against the file.
teams = pd.read_csv('./data/teams.csv')
def getinvolved(df, column):
    """Label each row with the pair of teams named in its title.

    A team counts as involved when either value of its row in the module-level
    ``teams`` table (unpacked as ``teamname, teamfull``) occurs as a substring
    of the title. The label is the two matching ``teamname`` values, sorted
    and joined as ``"A vs B"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. Side effect: an ``involved_teams`` column is written
        onto this object, with the string ``"None"`` for rows that did not
        yield exactly two teams.
    column : str
        Name of the column holding the title text.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows that matched exactly two teams,
        with a fresh 0..n-1 index.

    Notes
    -----
    Every title that does not resolve to exactly two teams is printed along
    with the teams that were found, so mismatches can be inspected by eye.
    """
    matchups = []
    for title in df[column]:
        if isinstance(title, str):
            # Set comprehension de-duplicates; sorting makes the label
            # independent of the order in which the teams appear in the title.
            involvedteams = sorted({
                teamname
                for teamname, teamfull in teams.values
                if teamname in title or teamfull in title
            })
        else:
            # A blank title is read from CSV as float NaN; a substring test
            # against it would raise TypeError, so treat it as "no teams".
            involvedteams = []

        if len(involvedteams) != 2:
            print(title, involvedteams)
            matchups.append("None")
        else:
            matchups.append(" vs ".join(involvedteams))

    df['involved_teams'] = matchups
    df = df[df['involved_teams'] != "None"].reset_index(drop=True)
    return df



# Label both frames with their "A vs B" fixture string; getinvolved drops
# every row whose title does not name exactly two teams.
pre_df = getinvolved(pre_df, 'ptitle')
post_df = getinvolved(post_df, 'ptitle')

# Left-join every pre row to the post threads that carry the same fixture label.
# Columns present on both sides get the default _x (pre) / _y (post) suffixes.
pre_post = pre_df.merge(
    post_df[['ptitle', 'pcreated', 'pcreated_date', 'involved_teams']],
    on="involved_teams",
    how='left',
)

# Whole days from the pre thread's creation to the post thread's creation.
pre_post['diff'] = [
    gap.days for gap in pre_post['pcreated_date_y'] - pre_post['pcreated_date_x']
]

# Keep only pairs that found a post thread at most 20 days after the pre thread.
# NOTE(review): negative gaps (post thread older than the pre thread) also pass
# this filter - confirm that is intended.
pre_post = pre_post[pre_post['diff'].notna() & (pre_post['diff'] <= 20)]

# One row per pre thread id, written with the index as the first CSV column.
pre_df.drop_duplicates("pid").to_csv("./data/unique_pre.csv")

#

In [None]:
# Write one row per distinct ptitle to unique_pre.csv in the current working directory.
# NOTE(review): the cell above writes ./data/unique_pre.csv de-duplicated on "pid" instead;
# confirm that both files (different folder, different key) are really wanted.
pre_df.drop_duplicates("ptitle").to_csv("unique_pre.csv")

In [None]:
#post_df_tk = TokenizeProcess(post_df)

In [None]:
# Persist the tokenized frame as CSV, omitting the index column.
# NOTE(review): pre_df was bound to this same object earlier (pre_df=pre_df_tk), so the
# pcreated_date / ccreated_date / involved_teams columns added through pre_df are
# written out here too - confirm that is intended.
pre_df_tk.to_csv("./data/pre_soccer_tokenized.csv",index=False)
#post_df_tk.to_csv("./data/post_soccer_tokenized.csv",index=False)

In [None]:
import pickle

# Pickle the tokenized frame so the list-valued `comment` column is stored as real
# Python lists. The context manager guarantees the handle is flushed and closed
# even if dump() raises; a bare open() passed inline is never explicitly closed.
with open("./data/pre_df_tk.p", "wb") as pickle_file:
    pickle.dump(pre_df_tk, pickle_file)
