In [35]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score, recall_score,classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [36]:
# tf/idf data
tf_idf = pd.read_csv("/home/andrea/Desktop/neotec/data/sentiment_analysis/tf_idf/labelled_convers_tf_idf_monograms.csv")
tf_idf.drop('Unnamed: 0',inplace=True,axis = 1)
tf_idf.head(2)

Unnamed: 0,conversation_id,aa,abajo,abatido,abc,abdomen,abierto,aborrecido,abrazo,abril,...,zapato,zaragoz,zarza,zinquillo,zipi,zipperface,zona,zono,zorrilla,primary_label
0,0H9OEQs318oQ55Rn0SvK103578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,negative
1,0H9OEQs318oQ55Rn0SvK913137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral


### Training, validation and test

In [37]:
# Create sets
X = tf_idf.drop(columns=["conversation_id","primary_label"], axis = 1)
y = tf_idf["primary_label"]

In [39]:
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

logistic_regression_results = pd.DataFrame()

parameters_column = []
score_column = []
intercept_column = []
coef_column = []
auc_column = []
cm_column = []

for i in range(40):


    # Divide data in training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
    #print(Counter(y_train),Counter(y_test))

    # Delete features with 0 variance
    feat_vars = np.var(X_train, axis=0)
    X_tr_ = X_train.loc[:, feat_vars != 0]
    X_tst_ = X_test.loc[:, feat_vars != 0]
    n, d_ = X_tr_.shape
    print(n, d_)

    # Normalize data
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_tr_)
    X_test_std = scaler.transform(X_tst_)

    # Validate parameters
    model = LogisticRegression(penalty = "l1",solver='liblinear',class_weight = "balanced")
    parameters_grid = {'C':np.logspace(-3, 1, 30)}

    clf = GridSearchCV(model,parameters_grid,cv = 10, n_jobs = -1,scoring='roc_auc_ovr')
    clf.fit(X_train_std, y_train)

    # Best parameters
    best_parameters = clf.best_params_ 
    best_score = clf.best_score_ 
    best_estimator = clf.best_estimator_

    intercept = best_estimator.intercept_
    coeficiente = best_estimator.coef_
    #print(coeficiente) -> 3 clasificadores, matriz 3x200

    best_estimator.fit(X_train_std,y_train)

    # Prediction
    y_pred = best_estimator.predict(X_test_std)

    # AUC-ROC
    r_probs = [0 for i in range(len(y_test))]
    y_pred_prob =best_estimator.predict_proba(X_test_std)
    mean_auc = roc_auc_score(y_test, y_pred_prob, average="weighted", multi_class='ovr')
    print(mean_auc)
    auc_column.append(mean_auc)
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    cm_column.append(cm)
    print(cm)

    # Save data
    parameters_column.append(best_parameters)
    score_column.append(best_score)
    intercept_column.append(intercept)
    coef_column.append(coeficiente)

logistic_regression_results["best_score"] = score_column
logistic_regression_results["best_parameters"] = parameters_column
logistic_regression_results["coef"] = coef_column
logistic_regression_results["intercept"] = intercept_column
logistic_regression_results["test_auc"] = auc_column
logistic_regression_results["test_confusion_matrix"] = cm_column

logistic_regression_results.to_csv("/content/drive/MyDrive/sentiment_analysis/logistic_regression_results.csv",float_format="%.6f")

1179 3357
0.7992613937289765
[[16 18  2]
 [ 7 45 10]
 [ 1  9 23]]
1179 3372
0.7825432752015599
[[10  9  1]
 [ 8 43 14]
 [ 0 16 30]]


ValueError: Length of values (2) does not match length of index (1)

In [40]:
score_column

[0.8223996391047959, 0.8290807133870304]

In [None]:
test = pd.DataFrame(list(zip(y_test,y_pred)),columns = ["test","pred"])
display(test)

In [None]:
display(logistic_regression_results)

Unnamed: 0,best_score,best_parameters,coef,intercept,test_accuracy,test_micro_precision,test_micro_recall
0,0.698824,{'C': 385.6620421163472},"[[0.0, 0.0, -0.0023830418007437827, 0.0, 0.0, ...","[-5.082346893874195, 2.9860560085312216, -9.53...",0.717949,0.717949,0.717949
1,0.70479,{'C': 239.5026619987486},"[[0.0, 0.0, -0.004314109056791422, 0.0, 0.0, 0...","[-4.632432313873381, 2.96831004991816, -9.4903...",0.74359,0.74359,0.74359
2,0.692773,{'C': 148.73521072935117},"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-4.809198539173711, 2.7173854094870165, -8.95...",0.769231,0.769231,0.769231
3,0.701008,{'C': 92.36708571873865},"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-4.764845370994286, 2.8444133583559252, -8.53...",0.666667,0.666667,0.666667
4,0.712857,{'C': 239.5026619987486},"[[0.0, 0.0, -0.08072862152947174, 0.0, 0.0, 0....","[-4.889965053063966, 2.7868782439733524, -8.94...",0.74359,0.74359,0.74359
5,0.722017,{'C': 239.5026619987486},"[[0.0, 0.0, -0.022108335231066492, 0.0, 0.0, 0...","[-4.846896874515453, 2.867533623934893, -9.329...",0.666667,0.666667,0.666667
6,0.716218,{'C': 239.5026619987486},"[[0.06112680630731129, 0.0, -0.005447359536467...","[-5.189769857439975, 3.1178896694440925, -9.01...",0.769231,0.769231,0.769231
7,0.684286,{'C': 0.04520353656360243},"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-0.602522149230779, 0.0, -1.5961054073651029]",0.74359,0.74359,0.74359
8,0.710084,{'C': 148.73521072935117},"[[0.0, 0.0, -0.00914618418506843, 0.0, 0.0, 0....","[-4.9491209185364475, 2.740744100666318, -8.78...",0.692308,0.692308,0.692308
9,0.701849,{'C': 148.73521072935117},"[[0.0, 0.0, -0.013032231204691692, 0.0, 0.0, 0...","[-4.807405422231132, 2.7603183322012392, -8.90...",0.74359,0.74359,0.74359


In [None]:
logistic_regression_results.to_csv("../data/sentiment_analysis/model_results/logistic_regression_l1_results.csv")

### Determine features with coef = 0

* First, compute the mean by feature and coefficient (40 experiments)
* Second, determine which values are smaller than 0 and 0.1

In [2]:
logistic_regression_results = pd.read_csv("/home/andrea/Desktop/neotec/data/sentiment_analysis/model_results/logistic_regression_l1_results.csv",index_col=0)
logistic_regression_results.head()

Unnamed: 0,best_score,best_parameters,coef,intercept,test_accuracy,test_micro_precision,test_micro_recall
0,0.647972,{'C': 0.04520353656360243},[[0. 0.03934381 0. ... 0. ...,[-1.18370975 -0.33386755 -0.97083267],0.671756,0.671756,0.671756
1,0.66755,{'C': 0.0727895384398315},[[ 0.00000000e+00 5.13121578e-02 0.00000000e...,[-1.49886997 -0.3273942 -1.20605966],0.580153,0.580153,0.580153
2,0.660727,{'C': 0.04520353656360243},[[ 0. 0. 0. ... 0. ...,[-1.20944275 -0.30899947 -0.98599682],0.641221,0.641221,0.641221
3,0.647943,{'C': 0.11721022975334805},[[ 0. 0.0902277 0. ... 0. ...,[-1.82021086 -0.35135356 -1.42741214],0.70229,0.70229,0.70229
4,0.661553,{'C': 0.04520353656360243},[[0.00000000e+00 1.13571951e-02 0.00000000e+00...,[-1.18436343 -0.32931244 -0.97655578],0.732824,0.732824,0.732824


In [32]:
coef = logistic_regression_results["coef"]

new_column = []
for string in coef:
    coef_by_label = string.replace("[","").replace("\n","").split("]")
    new_list = []
    for label in coef_by_label: 
        array = np.fromstring(label, dtype = float, sep = " ")
        new_list.append(array)
    new_column.append(np.array(new_list))

logistic_regression_results["np_coef"] = new_column

  array = np.fromstring(label, dtype = float, sep = " ")
  new_column.append(np.array(new_list))


In [34]:
logistic_regression_results["coef"][0]

'[[0.         0.03934381 0.         ... 0.         0.         0.        ]\n [0.         0.         0.         ... 0.         0.         0.        ]\n [0.09170395 0.         0.         ... 0.         0.         0.        ]]'

In [33]:
logistic_regression_results["np_coef"]

0     [[0.0, 0.03934381, 0.0], [0.0, 0.0, 0.0], [0.0...
1     [[0.0, 0.0513121578, 0.0], [-0.0374247734, -0....
2     [[0.0, 0.0, 0.0], [-0.00156394, 0.0, 0.0], [0....
3     [[0.0, 0.0902277, 0.0], [-0.07494604, -0.03269...
4     [[0.0, 0.0113571951, 0.0], [0.0, 0.0, 0.0], [0...
5     [[0.0, 0.03484231, 0.0], [0.0, 0.0, 0.0], [0.0...
6     [[0.0, 0.0419837312, 0.0], [0.0, 0.0, 0.0], [0...
7     [[0.0, 0.03814354, 0.0], [0.0, 0.0, 0.0], [0.0...
8     [[0.0, 0.0365590932, 0.0], [-0.0206300221, 0.0...
9     [[0.0, 0.02547558, 0.0], [0.0, 0.0, 0.0], [0.1...
10    [[-0.00910234, 0.02861082, 0.0], [0.0, 0.0, 0....
11    [[-0.000320095294, 0.0379561995, 0.0], [0.0, 0...
12    [[-0.00808221, 0.05424668, 0.0], [0.0, 0.0, 0....
13    [[-0.00143821427, 0.0910728057, -2.7476872e-05...
14    [[-0.0110739875, 0.0373363375, 0.0], [0.0, 0.0...
15    [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.09882255...
16    [[0.0, 0.02685073, 0.0], [0.0, 0.0, 0.0], [0.0...
17    [[0.05718538, 0.0, 0.0], [0.0, 0.0, 0.0], 

In [None]:
logistic_regression_results.head(2)

Unnamed: 0,best_score,best_parameters,coef,intercept,test_accuracy,test_micro_precision,test_micro_recall,coef_mean
0,0.698824,{'C': 385.6620421163472},"[[0.0, 0.0, -0.0023830418007437827, 0.0, 0.0, ...","[-5.082346893874195, 2.9860560085312216, -9.53...",0.717949,0.717949,0.717949,"[0.0, 0.0, -0.0007943472669145942, 0.0, 0.0, 0..."
1,0.70479,{'C': 239.5026619987486},"[[0.0, 0.0, -0.004314109056791422, 0.0, 0.0, 0...","[-4.632432313873381, 2.96831004991816, -9.4903...",0.74359,0.74359,0.74359,"[0.0, 0.0, 0.01210015267023373, 0.0, 0.0, 0.00..."


In [None]:
logistic_regression_results.to_csv("logistic_regression_l1_results.csv")

In [None]:
means = np.array(logistic_regression_results["coef_mean"])

In [None]:
final_mean = np.mean(means,axis = 0)

In [None]:
print(final_mean)

[-6.61309064e-04  2.14195790e-03  5.06517276e-03 ...  9.60572981e-05
  0.00000000e+00  1.81196362e-03]


In [None]:
feature_list = list(X.columns)
feature_mean_coef = [(feature, coef) for feature, coef in zip(feature_list, final_mean)]

In [None]:
feature_mean_coef

[('abajo', -0.0006613090635661546),
 ('abatido', 0.002141957900417625),
 ('abel', 0.005065172762745007),
 ('abelio', 0.0),
 ('abierto', 0.0),
 ('aborrecido', 0.0029791773595547843),
 ('abrazo', 0.0003949782357787207),
 ('abrir', 0.0004445024002828742),
 ('absoluto', 0.0),
 ('abuelo', 0.0),
 ('acabar', -0.012085385102590971),
 ('acalorado', 0.00010040854662337417),
 ('acaso', 0.0),
 ('acceder', 0.0),
 ('aceptar', -0.0003773924735035553),
 ('acerca', 0.0026357301431374),
 ('acercar', 0.001204830389750152),
 ('acero', 0.0024564645350709474),
 ('acidir', 0.00036070354416148744),
 ('aclarar', 0.0006734565044465376),
 ('acondicionado', 0.0022025578435568354),
 ('acordado', 6.77926833444989e-05),
 ('acordar', 0.0007216326096923473),
 ('acostado', 0.006613956497308659),
 ('acostar', -0.048310159939112605),
 ('actividad', 0.012731855626083258),
 ('activo', 0.0006236265465774914),
 ('actual', -0.00014829050057924668),
 ('actualmente', 0.00011502799317404268),
 ('acuerdo', 0.00019428886957794568)

In [None]:
np.save("average_coefficientes_per_feature.npy",feature_mean_coef)

### Search coefficients equal to 0

In [None]:
average_coeffs[0]

array(['abajo', '-0.0006613090635661546'], dtype='<U32')

In [None]:
null_coefs = []
for feature in average_coeffs:
    coef = feature[1]
    if coef == "0.0":
        null_coefs.append(feature)

In [None]:
len(null_coefs)

407

In [None]:
null_coefs

[array(['abelio', '0.0'], dtype='<U32'),
 array(['abierto', '0.0'], dtype='<U32'),
 array(['absoluto', '0.0'], dtype='<U32'),
 array(['abuelo', '0.0'], dtype='<U32'),
 array(['acaso', '0.0'], dtype='<U32'),
 array(['acceder', '0.0'], dtype='<U32'),
 array(['acusación', '0.0'], dtype='<U32'),
 array(['afectar', '0.0'], dtype='<U32'),
 array(['afonicar', '0.0'], dtype='<U32'),
 array(['ago', '0.0'], dtype='<U32'),
 array(['agoto', '0.0'], dtype='<U32'),
 array(['alegria', '0.0'], dtype='<U32'),
 array(['alguien', '0.0'], dtype='<U32'),
 array(['alimentario', '0.0'], dtype='<U32'),
 array(['alla', '0.0'], dtype='<U32'),
 array(['ambar', '0.0'], dtype='<U32'),
 array(['amiga', '0.0'], dtype='<U32'),
 array(['anadir', '0.0'], dtype='<U32'),
 array(['andado', '0.0'], dtype='<U32'),
 array(['anemia', '0.0'], dtype='<U32'),
 array(['angustia', '0.0'], dtype='<U32'),
 array(['animar', '0.0'], dtype='<U32'),
 array(['anotación', '0.0'], dtype='<U32'),
 array(['anotado', '0.0'], dtype='<U32'),
 a

In [None]:
np.save("null_coefficients_logistic_regression.npy",null_coefs)

### Search coefficients > 0.1

In [None]:
# mayores que el umbral 
coeffs_umbral = []

for feature in average_coeffs:
    coef = feature[1]
    if float(coef) > 0.1:
        coeffs_umbral.append(feature)

In [None]:
len(coeffs_umbral)

5

In [None]:
print(coeffs_umbral)

[array(['equivocado', '0.1310412847780708'], dtype='<U32'), array(['medias', '0.10315432840896847'], dtype='<U32'), array(['residencia', '0.15825370113632511'], dtype='<U32'), array(['velio', '0.11199049625543973'], dtype='<U32'), array(['vinir', '0.11194586188377345'], dtype='<U32')]


In [None]:
np.save("coeffients_greater_than_0.1_logistic_regression.npy",coeffs_umbral)

### Results

In [None]:
results = pd.read_csv("/home/andrea/Desktop/neotec/data/sentiment_analysis/model_results/logistic_regression_l1_mean_results.csv",index_col=0)
results.head(2)

Unnamed: 0,best_score,best_parameters,coef,intercept,test_accuracy,test_micro_precision,test_micro_recall,coef_mean
0,0.698824,{'C': 385.6620421163472},[[ 0. 0. -0.00238304 ... 0. ...,[-5.08234689 2.98605601 -9.53652849],0.717949,0.717949,0.717949,[ 0. 0. -0.00079435 ... 0. ...
1,0.70479,{'C': 239.5026619987486},[[ 0. 0. -0.00431411 ... 0.0...,[-4.63243231 2.96831005 -9.49038036],0.74359,0.74359,0.74359,[0. 0. 0.01210015 ... 0.002879...


In [None]:
average_coeffs = np.load("average_coefficients_per_feature_logistic_regression.npy")
average_coeffs = sorted(average_coeffs, key = lambda x: x[1], reverse = False)

In [None]:
average_coeffs

[array(['porrazo', '-0.00010754593027969802'], dtype='<U32'),
 array(['repitemar', '-0.00011210650156221297'], dtype='<U32'),
 array(['calculo', '-0.00011244764857170614'], dtype='<U32'),
 array(['causar', '-0.00011764508029424357'], dtype='<U32'),
 array(['impacto', '-0.00011989102153987268'], dtype='<U32'),
 array(['luz', '-0.00011993161794356227'], dtype='<U32'),
 array(['hilo', '-0.00012898052616447446'], dtype='<U32'),
 array(['caca', '-0.0001313912036002639'], dtype='<U32'),
 array(['char', '-0.00013260912123226078'], dtype='<U32'),
 array(['adormilado', '-0.00013329118642482621'], dtype='<U32'),
 array(['log', '-0.00013376717237309743'], dtype='<U32'),
 array(['ratito', '-0.00014523013597584424'], dtype='<U32'),
 array(['llameis', '-0.00014620440216202345'], dtype='<U32'),
 array(['actual', '-0.00014829050057924668'], dtype='<U32'),
 array(['comer', '-0.0001516168062566567'], dtype='<U32'),
 array(['impuesto', '-0.00015175425292779805'], dtype='<U32'),
 array(['menus', '-0.00015

In [None]:
null_coefs = np.load("null_coefficients_logistic_regression.npy")

In [None]:
for i in average_coeffs:
    if i[0] == "no":
        print(i)

['no' '-0.011404671822052442']


In [None]:
"no" in null_coefs

False

In [None]:
print(null_coefs)

[['abelio' '0.0']
 ['abierto' '0.0']
 ['absoluto' '0.0']
 ['abuelo' '0.0']
 ['acaso' '0.0']
 ['acceder' '0.0']
 ['acusación' '0.0']
 ['afectar' '0.0']
 ['afonicar' '0.0']
 ['ago' '0.0']
 ['agoto' '0.0']
 ['alegria' '0.0']
 ['alguien' '0.0']
 ['alimentario' '0.0']
 ['alla' '0.0']
 ['ambar' '0.0']
 ['amiga' '0.0']
 ['anadir' '0.0']
 ['andado' '0.0']
 ['anemia' '0.0']
 ['angustia' '0.0']
 ['animar' '0.0']
 ['anotación' '0.0']
 ['anotado' '0.0']
 ['antes' '0.0']
 ['anti' '0.0']
 ['apenar' '0.0']
 ['apoderado' '0.0']
 ['apoyo' '0.0']
 ['aquel' '0.0']
 ['armir' '0.0']
 ['atentar' '0.0']
 ['atento' '0.0']
 ['atrasado' '0.0']
 ['atroz' '0.0']
 ['autobus' '0.0']
 ['babel' '0.0']
 ['baja' '0.0']
 ['bajo' '0.0']
 ['balon' '0.0']
 ['bar' '0.0']
 ['barbara' '0.0']
 ['barbaridad' '0.0']
 ['barbaro' '0.0']
 ['barcelona' '0.0']
 ['bascula' '0.0']
 ['base' '0.0']
 ['beba' '0.0']
 ['belgio' '0.0']
 ['belio' '0.0']
 ['biologia' '0.0']
 ['bocata' '0.0']
 ['bolsa' '0.0']
 ['braga' '0.0']
 ['breve' '0.0']
 

In [None]:
coeffs_umbral = np.load("coeffients_greater_than_0.1_logistic_regression.npy")

In [None]:
print(coeffs_umbral)

[['equivocado' '0.1310412847780708']
 ['medias' '0.10315432840896847']
 ['residencia' '0.15825370113632511']
 ['velio' '0.11199049625543973']
 ['vinir' '0.11194586188377345']]
