In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
# Load the pre-labelled corpora for the three sources (ECB, FED, BIS).
# Each frame is built in a single expression (read, then drop unwanted
# columns) rather than mutated in place afterwards.
RAW_DATA_DIR = '../raw_data'

# Sentiment-labelled files. The FED file carries an extra `audience` column
# that is dropped here so the three frames share the same columns when stacked.
df_ecb_sent = pd.read_csv(f'{RAW_DATA_DIR}/ECB_prelabelled_sent.txt')
df_fed_sent = pd.read_csv(f'{RAW_DATA_DIR}/FED_prelabelled_sent.txt').drop(columns=['audience'])
df_bis_sent = pd.read_csv(f'{RAW_DATA_DIR}/BIS_prelabelled_sent.txt')

# Agent-labelled files. The ECB and BIS files contain an `Unnamed: 0` column
# (presumably a saved index -- TODO confirm) that is dropped.
df_ecb_ag = pd.read_csv(f'{RAW_DATA_DIR}/ECB_prelabelled.txt').drop(columns=['Unnamed: 0'])
# NOTE(review): only '\n' is treated as the row terminator for the FED file,
# presumably because its text contains other line-break characters -- confirm.
df_fed_ag = pd.read_csv(f'{RAW_DATA_DIR}/FED_prelabelled.txt', lineterminator='\n')
df_bis_ag = pd.read_csv(f'{RAW_DATA_DIR}/BIS_prelabelled.txt').drop(columns=['Unnamed: 0'])

In [7]:
# Stack the ECB, FED and BIS sentiment frames row-wise into one corpus.
# Each source keeps its own row labels, so index values repeat across sources.
sentiment_frames = [df_ecb_sent, df_fed_sent, df_bis_sent]
df_sent = pd.concat(sentiment_frames, axis=0)

# Preview call kept from the original cell; the recorded output comes from
# `.info()`, which prints the column summary.
df_sent.head()
df_sent.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13458 entries, 0 to 4211
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       13458 non-null  object
 1   sentiment  13458 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 315.4+ KB


In [8]:
# Stack the ECB, FED and BIS agent frames row-wise, then give the two
# resulting columns the names used by the rest of the notebook.
agent_frames = [df_ecb_ag, df_fed_ag, df_bis_ag]
df_ag = pd.concat(agent_frames, axis=0)
df_ag.columns = ['text', 'agent']

# Preview call kept from the original cell; the recorded output comes from
# `.info()`, which prints the column summary.
df_ag.head()
df_ag.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15060 entries, 0 to 4803
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    15060 non-null  object
 1   agent   15060 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 353.0+ KB


In [9]:
# Hold out 20% of each corpus for testing. Both splits use the same seed and
# are stratified on their label column.
split_options = {'test_size': 0.2, 'random_state': 42}

# Task 1: sentiment labels.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    df_sent['text'],
    df_sent['sentiment'],
    stratify=df_sent['sentiment'],
    **split_options,
)

# Task 2: agent labels.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    df_ag['text'],
    df_ag['agent'],
    stratify=df_ag['agent'],
    **split_options,
)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# NLTK's English stop words, minus a set of negation words that `clean` must
# not strip (presumably because removing a negation changes what a sentence
# says -- the kept words are all negations).
stop_words = set(stopwords.words('english'))

# Negation words to keep in the text; each word is listed exactly once.
words_to_keep = [
    'no',
    'not',
    'none',
    'neither',
    'never',
    'nobody',
    'nothing',
    'nowhere',
]

# Still a list, but sorted: iterating a set of strings yields a different
# order from run to run, and a stable order keeps the list reproducible.
filtered_stop_words = sorted(word for word in stop_words if word not in words_to_keep)

def clean(text, remove_stopwords=True):
    """Lemmatize a whitespace-tokenized text, optionally stripping stop words.

    Args:
        text: Input string. It is split on whitespace only, so punctuation
            stays attached to the token it touches.
        remove_stopwords: When True (the default), tokens found in the
            module-level ``filtered_stop_words`` are dropped before
            lemmatization. The match ignores case.

    Returns:
        The remaining tokens, each passed through
        ``WordNetLemmatizer.lemmatize`` and joined with single spaces.
    """
    tokens = text.split()
    if remove_stopwords:
        # The stop-word list is lowercase, so compare on the lowercased token;
        # a plain membership test would let capitalized stop words such as
        # "The" or "We" slip through. The token itself is kept unchanged.
        tokens = [token for token in tokens if token.lower() not in filtered_stop_words]
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


In [12]:
# Sentiment texts (X1): lemmatize only; stop words are left in place.
# Extra keyword arguments given to Series.apply are forwarded to `clean`.
X1_train_clean = X1_train.apply(clean, remove_stopwords=False)
X1_test_clean = X1_test.apply(clean, remove_stopwords=False)

# Agent texts (X2): `clean` runs with its default, so stop words are removed.
X2_train_clean = X2_train.apply(clean)
X2_test_clean = X2_test.apply(clean)

In [13]:
# TF-IDF over unigrams and bigrams.
# A vectorizer holds exactly one fitted vocabulary, so fitting the same
# instance on both corpora would leave X1_train_vec without the vocabulary
# that produced its columns. The sentiment corpus therefore gets its own
# instance; `tf_idf_vectorizer` ends up fitted on the agent corpus, exactly
# as it did before.
# NOTE(review): these matrices are built from the raw texts (X1_train /
# X2_train), not from the *_clean series the pipelines are trained on -- confirm
# that this is intended.
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tf_idf_vectorizer_sent = TfidfVectorizer(ngram_range=(1,2))

X1_train_vec = tf_idf_vectorizer_sent.fit_transform(X1_train)
X2_train_vec = tf_idf_vectorizer.fit_transform(X2_train)

In [15]:
from sklearn.svm import SVC
# SVC with default hyper-parameters. The pipelines built below construct their
# own SVC(...) instances, so nothing in the visible cells reads `svm`.
svm = SVC()

In [16]:
# Sentiment classifier: TF-IDF (unigrams + bigrams) feeding an RBF-kernel SVM.
# The pipeline is given its OWN vectorizer instance. make_pipeline stores the
# estimator objects it receives without copying them, so a vectorizer shared
# with another pipeline would be refitted by that pipeline's fit() and this
# model's vocabulary would no longer match its trained SVM.
best_pipeline_svm1 = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    SVC(kernel='rbf', C=10, gamma=0.1),
)
best_pipeline_svm1.fit(X1_train_clean, y1_train)

In [17]:
import pickle

# Serialize the fitted sentiment pipeline to 'svm_sentiment.pkl' in the
# current working directory (binary pickle format).
with open('svm_sentiment.pkl', mode='wb') as model_file:
    pickle.dump(best_pipeline_svm1, model_file)

In [18]:
# Agent classifier: TF-IDF (unigrams + bigrams) feeding an RBF-kernel SVM.
# The pipeline is given its OWN vectorizer instance. make_pipeline stores the
# estimator objects it receives without copying them, so fitting this pipeline
# with a shared vectorizer would refit (and thereby invalidate) the vectorizer
# of every other pipeline holding the same object.
best_pipeline_svm2 = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    SVC(kernel='rbf', C=10, gamma=0.5),
)
best_pipeline_svm2.fit(X2_train_clean, y2_train)

In [19]:
# Serialize the fitted agent pipeline to 'svm_agent.pkl' in the current
# working directory (binary pickle format).
with open('svm_agent.pkl', mode='wb') as model_file:
    pickle.dump(best_pipeline_svm2, model_file)

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer

2024-09-05 14:19:00.404645: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-05 14:19:00.407653: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-05 14:19:00.443803: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-05 14:19:00.513223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-05 14:19:00.599158: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

In [27]:
# Keras tokenizer for the sentiment task, created with default settings and
# fitted on the raw (not lemmatized) sentiment training texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X1_train)

In [28]:
# Serialize the fitted sentiment tokenizer to 'Tokenizer_sentiment.pkl' in the
# current working directory (binary pickle format).
with open('Tokenizer_sentiment.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [29]:
# Keras tokenizer for the agent task, created with default settings and
# fitted on the raw (not cleaned) agent training texts.
tokenizer_ag = Tokenizer()
tokenizer_ag.fit_on_texts(texts=X2_train)

In [30]:
# Serialize the fitted agent tokenizer to 'Tokenizer_agent.pkl' in the
# current working directory (binary pickle format).
with open('Tokenizer_agent.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer_ag, tokenizer_file)