# CaixaBank Hackaton - classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## import data train

In [2]:
df = pd.read_csv("train.csv")
df['Date'] = pd.to_datetime(df['Date'])
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
df.drop("Date", axis=1, inplace=True)
df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Target,year,month,day
0,3615.199951,3654.699951,3581.0,3654.5,3654.496338,0.0,0,1994,1,3
1,3654.5,3675.5,3625.100098,3630.300049,3630.296387,0.0,1,1994,1,4
2,3625.199951,3625.199951,3583.399902,3621.199951,3621.196289,0.0,1,1994,1,5
3,,,,,,,0,1994,1,6
4,3621.199951,3644.399902,3598.699951,3636.399902,3636.39624,0.0,1,1994,1,7


## Import tweets dataframe

In [3]:
tweet_df = pd.read_csv("tweets_from2015_#Ibex35.csv")
tweet_df['tweetDate'] = pd.to_datetime(tweet_df['tweetDate'], errors='coerce')
tweet_df = tweet_df[tweet_df.notna()]
tweet_df['tweetDate'] = tweet_df['tweetDate'].apply(lambda x: str(x)[:10])
tweet_df['tweetDate'] = pd.to_datetime(tweet_df['tweetDate'], format="%Y-%m-%d")
tweet_df['year'] = tweet_df['tweetDate'].dt.year
tweet_df['month'] = tweet_df['tweetDate'].dt.month
tweet_df['day'] = tweet_df['tweetDate'].dt.day
tweet_df.drop("tweetDate", axis=1, inplace=True)
tweet_df.head()

Unnamed: 0,handle,text,year,month,day
0,abelac62,He hecho el repaso de todos los componentes de...,2022.0,4.0,9.0
1,LluisPerarnau,Els projectes que han presentat les empreses d...,2022.0,4.0,7.0
2,Pegaso121080,"Por si no lo has visto, o no lo encuentras en ...",2022.0,4.0,4.0
3,zonavalue,📈 #BOLSA: El #Ibex35 abre en 🟢 \n\n🇪🇸 #Ibex35 ...,2022.0,4.0,5.0
4,EPeconomia,"El #Ibex35 retrocede un 0,4% en marzo y un 3,0...",2022.0,3.0,31.0


## Import data test

In [4]:
test_df = pd.read_csv("test_x.csv")
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df['year'] = test_df['Date'].dt.year
test_df['month'] = test_df['Date'].dt.month
test_df['day'] = test_df['Date'].dt.day
test_df.drop("Date", axis=1, inplace=True)
test_df.head()

Unnamed: 0,test_index,Open,High,Low,Close,Adj Close,Volume,year,month,day
0,6557,9136.799805,9173.400391,9095.0,9150.5,9150.5,158753000.0,2019,6,5
1,6558,9169.200195,9246.200195,9136.700195,9169.200195,9169.200195,212720900.0,2019,6,6
2,6559,9186.700195,9261.400391,9185.700195,9236.099609,9236.099609,150664700.0,2019,6,7
3,6560,9284.200195,9302.200195,9248.099609,9294.099609,9294.099609,102323700.0,2019,6,10
4,6561,9288.599609,9332.5,9273.400391,9282.099609,9282.099609,144701200.0,2019,6,11


## Traducir tweets a ingles
Esta traducción es necesaria para que nltk pueda hacer correctamente el analisis de sentimientos

Con NLTK reemplazo el tweet a un puntaje de -1 a 1, donde los valores negativos son sentimientos malos, los valores positivos son sentimientos buenos y 0 no clasificados.

El siguiente paso es hacer un merge del dataset original con su puntaje asociado

In [5]:
!pip install googletrans==3.1.0a0
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from googletrans import Translator

translator = Translator()
tweet_df["text1"] = tweet_df["text"].apply(lambda x: translator.translate(x).text)

sid = SentimentIntensityAnalyzer()
tweet_df["score"] = tweet_df["text1"].apply(lambda x: sid.polarity_scores(x)["compound"])
tweet_df.drop(["handle", "text", "text1"], axis=1, inplace=True)

df = df.merge(tweet_df, how="left")
df.head()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Target,year,month,day,score
0,3615.199951,3654.699951,3581.0,3654.5,3654.496338,0.0,0,1994,1,3,
1,3654.5,3675.5,3625.100098,3630.300049,3630.296387,0.0,1,1994,1,4,
2,3625.199951,3625.199951,3583.399902,3621.199951,3621.196289,0.0,1,1994,1,5,
3,,,,,,,0,1994,1,6,
4,3621.199951,3644.399902,3598.699951,3636.399902,3636.39624,0.0,1,1994,1,7,


en otro dataframe se copia el original y se eliminan las filas con valores nulos

In [6]:
df2 = df.copy()
df2.dropna(inplace=True)

## Entrenamiento del dataset con 6 modelo

In [7]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score

X = df2.drop("Target", axis=1)
y = df2['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

modelos_class = {
    "Logit": LogisticRegression(),
    "DecTree": DecisionTreeClassifier(),
    "RFC": RandomForestClassifier(),
    "AdaBoostC": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
}

for name, model in modelos_class.items():
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(f"F1-score {name:<11}: {f1_score(y_test, y_pred):.3f}")

F1-score Logit      : 0.694
F1-score DecTree    : 0.957
F1-score RFC        : 0.960
F1-score AdaBoostC  : 0.723
F1-score XGBoost    : 0.827
F1-score LightGBM   : 0.949


## Voting classifier
Este modelo convina varios para tener un puntaje convinado, se utilizan:

1. Decision Tree classifier
2. Random forest classifier
3. LightGBM classifier

In [8]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

estimators = [
  ("DecTree",DecisionTreeClassifier()),
  ("RFC",RandomForestClassifier()),
  ("LightGBM",LGBMClassifier())
]

vc = VotingClassifier(estimators=estimators, voting="hard")
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
f1 = f1_score(y_test, y_pred, average="macro")
print(f"F1-score: {f1:.3f}")

F1-score: 0.962


> El reslultado final es de un F1-score de 0.96

## Prediccion del data test

In [9]:
test_df = test_df.merge(tweet_df, how="left")
test_df.fillna(df['score'].mean(), inplace=True)
test_df.drop_duplicates(inplace=True)
test_df.head()

Unnamed: 0,test_index,Open,High,Low,Close,Adj Close,Volume,year,month,day,score
0,6557,9136.799805,9173.400391,9095.0,9150.5,9150.5,158753000.0,2019,6,5,0.0
3,6557,9136.799805,9173.400391,9095.0,9150.5,9150.5,158753000.0,2019,6,5,0.4588
5,6557,9136.799805,9173.400391,9095.0,9150.5,9150.5,158753000.0,2019,6,5,0.3956
7,6558,9169.200195,9246.200195,9136.700195,9169.200195,9169.200195,212720900.0,2019,6,6,0.4215
8,6558,9169.200195,9246.200195,9136.700195,9169.200195,9169.200195,212720900.0,2019,6,6,0.5574


> las filas se repiten por que hay varios tweets en un solo dia por lo que se calcula el promedio para ver la situación en ese día especifico.

In [10]:
new_scores = test_df.groupby("test_index")['score'].mean().values

In [11]:
test_df.drop("score", axis=1, inplace=True)
test_df.drop_duplicates(inplace=True)
test_df['score'] = new_scores
test_df.head()

Unnamed: 0,test_index,Open,High,Low,Close,Adj Close,Volume,year,month,day,score
0,6557,9136.799805,9173.400391,9095.0,9150.5,9150.5,158753000.0,2019,6,5,0.2848
7,6558,9169.200195,9246.200195,9136.700195,9169.200195,9169.200195,212720900.0,2019,6,6,0.27582
12,6559,9186.700195,9261.400391,9185.700195,9236.099609,9236.099609,150664700.0,2019,6,7,0.041725
17,6560,9284.200195,9302.200195,9248.099609,9294.099609,9294.099609,102323700.0,2019,6,10,-0.20315
22,6561,9288.599609,9332.5,9273.400391,9282.099609,9282.099609,144701200.0,2019,6,11,0.00816


In [12]:
result = pd.DataFrame({
    "test_index": test_df.iloc[:,0],
    "Target": vc.predict(test_df.iloc[:,1:])
})
result

Unnamed: 0,test_index,Target
0,6557,0
7,6558,0
12,6559,0
17,6560,0
22,6561,1
...,...,...
3957,7278,1
3958,7279,1
3960,7280,1
3961,7281,1


In [13]:
result.to_csv("predictions.csv", index=False)
result.to_json("predictions.json")