In [1]:
# change default huggingface cache directory
import os
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface/hub"

# Import needed libraries 
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from transformers import BioGptModel, BioGptConfig, BioGptForCausalLM
from sentence_transformers import SentenceTransformer, util

import pandas as pd
import numpy as np
import scipy
import traceback
import scanpy as sc
import anndata as ad
import random

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, roc_auc_score, precision_recall_curve, classification_report
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler, QuantileTransformer

#Scanorama Batch Normalization 
import scanorama

## Step 1: Generate Question + Answers from GPT 

trying BioMedLM model to see if I like it better 

In [2]:
# Prepare Data for Normalization 
# Reading the existing data into dataframes 
df_dichek = pd.read_csv('all_data_matrix_logNorm.csv', index_col=0,header=0).T
df_pedroza = pd.read_csv('pedroza_data_matrix_logNorm.csv',index_col=0,header=0).T

df_dichek_metadata = pd.read_csv('all_metadata.csv',index_col=0,header=0)
df_pedroza_metadata = pd.read_csv('pedroza_metadata.csv',index_col=0,header=0)

#Changing indices so that they match and the two data sets can be joined together 
df_dichek.index = df_dichek.index.str.replace('.','-')
df_pedroza.index = df_pedroza.index.str.replace('.', '-')

#make it so that df_dichek and df_pedroza have the same genes involved 
intersecting_genes = df_dichek.columns.intersection(df_pedroza.columns)
X_dichek = df_dichek[intersecting_genes]
X_pedroza = df_pedroza[intersecting_genes]

# Split metadata into the SHF = 0, CNC = 1 
y_dichek = (df_dichek_metadata['lineage']=='CNC').astype('int')
y_pedroza  = (df_pedroza_metadata['lineage'] == 'CNC').astype('int')
print("y_dichek shape: ",y_dichek.shape)
print("y_pedroza shape: ", y_pedroza.shape)

# Transform into np arrays 
np_dichek = X_dichek.to_numpy()
np_pedroza = X_pedroza.to_numpy()
datasets = [np_dichek, np_pedroza]

#Make genes list 
num_genes = intersecting_genes.size
genes_list = list()
for i in range(0, num_genes): 
    genes_list.append(intersecting_genes[i])
    
nested_list = list()
nested_list.append(genes_list.copy())
nested_list.append(genes_list.copy())

#Generate Corrected data
# integrated, corrected, genes = scanorama.correct(datasets, nested_list, return_dimred=True)

# #store batch corrected data in variables 
# df_batch_corrected_dichek = pd.DataFrame.sparse.from_spmatrix(corrected[0])
# df_batch_corrected_pedroza = pd.DataFrame.sparse.from_spmatrix(corrected[1])
# print(df_batch_corrected_dichek.shape)
# print(df_batch_corrected_pedroza.shape)
# integrated_dichek = integrated[0]
# integrated_pedroza = integrated[1]

y_dichek shape:  (15431,)
y_pedroza shape:  (9745,)


## Step 2: Find the data with the highest standard deviation 

In [3]:
X = pd.concat([X_dichek,X_pedroza]).to_numpy()
y = pd.concat([y_dichek, y_pedroza]).to_numpy()
print(X.shape)
print(y.shape)

(25176, 16855)
(25176,)


In [4]:
standard_devs = np.zeros(X.shape[1])
for i in range (0, X.shape[1]):
    standard_devs[i] = np.std(X.T[i])
indices = standard_devs.argsort() #indices sorted in reverse order 
print(indices)
print(indices.shape)
print(standard_devs[indices[0]])
print(standard_devs[indices[X.shape[1] - 1]])

[ 4609 10530  3785 ...   983 14921 11736]
(16855,)
0.0
1.4157742311629988


In [5]:
top_1000_indices = indices[X.shape[1] - 1000:]
top_1000_indices.sort()
print(top_1000_indices.shape)
X_top_1000 = pd.concat([X_dichek, X_pedroza]).iloc[:, top_1000_indices]
print(X_top_1000.shape)

(1000,)
(25176, 1000)


## Step 3: Create the array of strings 

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available else "cpu")

In [None]:
model_name = "stanford-crfm/BioMedLM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [38]:
top_1000_genes = X_top_1000.columns
print(top_1000_genes)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
text_descriptions = list()
for i in range(200,300): 
    Query = f"""
    Question <1>: How is TGFb related to the development of aortic aneurysms in the ascending aorta of mice? 
    Answer <2>: TGFb plays both a protective and pathogenic role in the development of aortic aneurysms, as some studies cite that TGFb signaling is protective against thoracic and abdominal disease in mouse models while other studies note that increased activity can be lethal in utero
    Question <2>: How is {top_1000_genes[i]} related to the development of aortic aneurysms in the ascending aorta of mice? 
    Answer <2>: 
    """
    Query1 = f"""
    Question <1>: How is {top_1000_genes[i]} related to CNC and SHF cells in the aortas of mice? 
    Answer <1>: 
    """
    input_text = Query.strip()
    inputs = tokenizer(input_text, return_tensors='pt').to(device)
#     print({k: v.shape for k, v in inputs.items()})
    output_ids = model.generate(
        **inputs,
        do_sample=True,
        max_new_tokens=200,
        top_p=0.95,
        temperature=0.7,
        bad_words_ids= tokenizer(["\\", ' [', "[", ']', "@", '=', '#', '^', '*', ' *', ' #', '{', ' {', '}', ' ([', '~', ' ~', '\n', ' <', '<', '>']).input_ids,
    )

    answer_ids = output_ids[0][len(inputs.input_ids[0]):]
    answer_text = tokenizer.decode(answer_ids)
    answer_text = answer_text.strip()
#     print(input_text)
#     print(answer_text)
    text_descriptions.append(answer_text)


In [None]:
print(len(text_descriptions))
print(text_descriptions[0])
print(text_descriptions[99])
pd.DataFrame(text_descriptions, columns=['text_descriptions']).to_csv("data/top_1000_genes_cells200-300_Query1_answers.csv")

In [6]:
file_names = ["data/top_1000_genes_cells100-200_Query1_answers.csv", "data/top_1000_genes_cells200-300_Query1_answers.csv", "data/top_1000_genes_cells300-400_Query1_answers.csv", "data/top_1000_genes_cells400-500_Query1_answers.csv", "data/top_1000_genes_cells500-600_Query1_answers.csv", "data/top_1000_genes_cells600-700_Query1_answers.csv", "data/top_1000_genes_cells300-400_Query1_answers.csv", "data/top_1000_genes_cells700-800_Query1_answers.csv", "data/top_1000_genes_cells800-900_Query1_answers.csv", "data/top_1000_genes_cells900-1000_Query1_answers.csv"]
gene_descriptions_Query1 = pd.read_csv("data/top_1000_genes_cells0-100_Query1_answers.csv")
for i in range(0, len(file_names)):
    new_descripts = pd.read_csv(file_names[i])
    gene_descriptions_Query1 = pd.concat([gene_descriptions_Query1, new_descripts])
print(gene_descriptions_Query1.iloc[200])

Unnamed: 0                                                           0
text_descriptions    Nexn is a component of the extracellular matri...
Name: 0, dtype: object


In [7]:
#Generate Embeddings 
model = SentenceTransformer('pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb')
embeddings = list()
for i in range(0,1000): 
    embeddings.append(model.encode(gene_descriptions_Query1.iloc[i][1]))

In [8]:
similarity_matrix = np.zeros([1000,1000])
for i in range(0,1000):
    for j in range(0,1000):
        similarity = util.pytorch_cos_sim(embeddings[i], embeddings[j])
        similarity_matrix[i][j] = similarity[0][0]
print(similarity_matrix)

[[1.         0.64041317 0.65253288 ... 0.67523646 0.69934809 0.71627951]
 [0.64041317 1.         0.66670632 ... 0.56841803 0.58007854 0.57984048]
 [0.65253288 0.66670632 0.99999982 ... 0.6492011  0.60106933 0.57205641]
 ...
 [0.67523646 0.56841803 0.6492011  ... 1.00000012 0.54780734 0.54762679]
 [0.69934809 0.58007854 0.60106933 ... 0.54780734 1.00000012 0.66877347]
 [0.71627951 0.57984048 0.57205641 ... 0.54762679 0.66877347 0.99999994]]


In [9]:
# Save similarity matrix as a dataframe object 
df_similarity_matrix = pd.DataFrame(similarity_matrix, columns = X_top_1000.columns)
df_similarity_matrix.to_csv('Query1_similarity_matrix.csv')

## Step 4: Normalize data using similarity matrix 
Now, we use the generated similarity matrix to normalize the baseline data. 

In [15]:
# Read similarity matrix 
similarity_matrix = pd.read_csv('Query1_similarity_matrix.csv').to_numpy()
similarity_matrix = np.delete(similarity_matrix, 0, 1)
print(similarity_matrix.shape)
print(similarity_matrix)

(1000, 1000)
[[1.         0.64041317 0.65253288 ... 0.67523646 0.69934809 0.71627951]
 [0.64041317 1.         0.66670632 ... 0.56841803 0.58007854 0.57984048]
 [0.65253288 0.66670632 0.99999982 ... 0.6492011  0.60106933 0.57205641]
 ...
 [0.67523646 0.56841803 0.6492011  ... 1.00000012 0.54780734 0.54762679]
 [0.69934809 0.58007854 0.60106933 ... 0.54780734 1.00000012 0.66877347]
 [0.71627951 0.57984048 0.57205641 ... 0.54762679 0.66877347 0.99999994]]


In [61]:
# Create a random matrix 
type1 = 'random matrix'
random_matrix = np.random.rand(1000,1000)
print(random_matrix.shape)

type2 = 'original similarity matrix'

# Try different types of normalization for the similarity matrix 
type3= 'MaxAbsScaler'
transformer = MaxAbsScaler().fit(similarity_matrix)
transformer
sim_matrix_MaxAbsScaler = transformer.transform(similarity_matrix)
print(sim_matrix_MaxAbsScaler.shape)

type4 = 'MinMaxScaler'
scaler = MinMaxScaler()
scaler.fit(similarity_matrix)
sim_matrix_MinMaxScaler = scaler.transform(similarity_matrix)
print(sim_matrix_MinMaxScaler.shape)

type5 = 'StandardScaler'
scaler = StandardScaler()
scaler.fit(similarity_matrix)
sim_matrix_StandardScaler = scaler.transform(similarity_matrix)
print(sim_matrix_StandardScaler.shape)

type6 = 'QuantileTransformer'
qt = QuantileTransformer(n_quantiles=10, random_state=0)
sim_matrix_QuantileTransformer = qt.fit_transform(similarity_matrix)
print(sim_matrix_QuantileTransformer.shape)

similarityMatrices = [random_matrix, similarity_matrix, sim_matrix_MaxAbsScaler, sim_matrix_MinMaxScaler, sim_matrix_StandardScaler, sim_matrix_QuantileTransformer]
normalizationTypes = [type1, type2, type3, type4, type5, type6, 'baseline data']

(1000, 1000)
(1000, 1000)
(1000, 1000)
(1000, 1000)
(1000, 1000)


In [62]:
list_X_normalized = list()
for i in range(0, len(similarityMatrices)):
    list_X_normalized.append(np.matmul(X_top_1000.to_numpy(), similarityMatrices[i]))
list_X_normalized.append(X_top_1000)

7
QuantileTransformer


In [7]:
X = np.random.rand(X_top_1000.shape[0], X_top_1000.shape[1])
y = pd.concat([y_dichek, y_pedroza])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
rf = RandomForestClassifier() 
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)

y_prob_new = np.empty((y_prob.shape[0], ))
for j in range(0, y_prob.shape[0]):
    y_prob_new[j] = y_prob[j][1]

# Accuracy Classification Score 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_prob_new)
classification = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC :", auc_score)
print(classification)

Accuracy: 0.5559968228752978
Precision: 0.5742441209406495
Recall: 0.884442911348741
ROC AUC : 0.5017374845504469
              precision    recall  f1-score   support

           0       0.41      0.11      0.17      2137
           1       0.57      0.88      0.70      2899

    accuracy                           0.56      5036
   macro avg       0.49      0.50      0.44      5036
weighted avg       0.51      0.56      0.47      5036



In [13]:
# Try comparing the for the spearman correlation 
gene_similarity_matrix = np.zeros((1000,1000))
X_T = X_top_1000.to_numpy().T
for i in range(0,1000):
    for j in range(0,1000): 
        gene_similarity_matrix[i][j] = util.pytorch_cos_sim(X_T[i], X_T[j])

In [22]:
gene_sim_T = gene_similarity_matrix.T
sim_T = similarity_matrix.T
spearman_vals = list()
vals = np.zeros(1000)
for i in range(0,1000):
    spearman_vals.append(scipy.stats.spearmanr(gene_sim_T[i], sim_T[i]))
    vals[i] = scipy.stats.spearmanr(gene_sim_T[i], sim_T[i])[0]
print(np.mean(vals))

0.029315228842511462


In [66]:
#Train using the normalized data 
for i in range(0, len(list_X_normalized)):
    X = list_X_normalized[i]
    y = pd.concat([y_dichek, y_pedroza])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    rf = RandomForestClassifier() 
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)

    y_prob_new = np.empty((y_prob.shape[0], ))
    for j in range(0, y_prob.shape[0]):
        y_prob_new[j] = y_prob[j][1]

    # Accuracy Classification Score 
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob_new)
    classification = classification_report(y_test, y_pred)

    print(normalizationTypes[i])
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("ROC AUC :", auc_score)
    print(classification)

random matrix
Accuracy: 0.7720413026211279
Precision: 0.7552941176470588
Recall: 0.8904299583911235
ROC AUC : 0.8555929973859108
              precision    recall  f1-score   support

           0       0.81      0.61      0.70      2152
           1       0.76      0.89      0.82      2884

    accuracy                           0.77      5036
   macro avg       0.78      0.75      0.76      5036
weighted avg       0.78      0.77      0.77      5036

original similarity matrix
Accuracy: 0.6767275615567911
Precision: 0.6899390243902439
Recall: 0.7874043145441892
ROC AUC : 0.7282747101996464
              precision    recall  f1-score   support

           0       0.65      0.53      0.58      2162
           1       0.69      0.79      0.74      2874

    accuracy                           0.68      5036
   macro avg       0.67      0.66      0.66      5036
weighted avg       0.67      0.68      0.67      5036

MaxAbsScaler
Accuracy: 0.6685861795075456
Precision: 0.6787234042553192
Rec

In [67]:
#Train using baseline dataset and the normalized dataset together 
y = pd.concat([y_dichek,y_pedroza, y_dichek, y_pedroza]).to_numpy()
for i in range(0, len(list_X_normalized) -1):
    X = np.append(X_top_1000, list_X_normalized[i], axis = 0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    rf = RandomForestClassifier() 
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)

    y_prob_new = np.empty((y_prob.shape[0], ))
    for j in range(0, y_prob.shape[0]):
        y_prob_new[j] = y_prob[j][1]

    # Accuracy Classification Score 
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob_new)
    classification = classification_report(y_test, y_pred)

    print(normalizationTypes[i])
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("ROC AUC :", auc_score)
    print(classification)

#Try using concatenation of both, use original and augmented data matrix 
# For augmeneted dataset, try to do a smaller training set, 30 train 70 test

random matrix
Accuracy: 0.8515539668354681
Precision: 0.8418247374196582
Recall: 0.9170081967213115
ROC AUC : 0.9310594035418652
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      4215
           1       0.84      0.92      0.88      5856

    accuracy                           0.85     10071
   macro avg       0.86      0.84      0.84     10071
weighted avg       0.85      0.85      0.85     10071

original similarity matrix
Accuracy: 0.7945586337007249
Precision: 0.7863831137914616
Recall: 0.8726664318421979
ROC AUC : 0.8728552789842178
              precision    recall  f1-score   support

           0       0.81      0.69      0.75      4393
           1       0.79      0.87      0.83      5678

    accuracy                           0.79     10071
   macro avg       0.80      0.78      0.79     10071
weighted avg       0.80      0.79      0.79     10071

MaxAbsScaler
Accuracy: 0.8024029391321617
Precision: 0.806574559313959
Reca

## Step 6: Try using sentiment Analysis for Filtering
First, I will use the results from sentiment analysis to filter the data and choose only the genes with a positive sentiment analysis 

In [7]:
# Perform Sentiment Analysis on the results that were generated here 
sentiment_pipeline = pipeline("sentiment-analysis")
data = list()
for i in range(0, 1000):
    data.append(gene_descriptions_Query1.iloc[i][1])
sentiments = sentiment_pipeline(data)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [10]:
positiveSentis_indices = list()
for i in range(0,1000):
    if(sentiments[i]['label'] == 'POSITIVE'):
        positiveSentis_indices.append(i)
print(positiveSentis_indices)
print("positiveSentis_indices length: " , len(positiveSentis_indices))
for i in range(3):
    print("Positive Sentiment Sentence:", gene_descriptions_Query1.iloc[positiveSentis_indices[i]][1])

for i in range(3):
    print("Negative Sentiment Sentence:", gene_descriptions_Query1.iloc[positiveSentis_indices[i] + 1][1])

[11, 31, 53, 67, 69, 85, 86, 104, 134, 149, 185, 186, 189, 190, 196, 202, 203, 246, 252, 254, 259, 265, 269, 277, 292, 297, 303, 304, 320, 323, 340, 355, 361, 384, 393, 402, 407, 414, 415, 417, 431, 458, 460, 461, 463, 477, 483, 488, 489, 495, 496, 502, 511, 523, 538, 541, 554, 558, 567, 577, 615, 627, 631, 644, 649, 654, 663, 668, 694, 696, 703, 704, 720, 723, 740, 755, 761, 784, 793, 809, 820, 829, 835, 853, 863, 896, 909, 919, 929, 931, 935, 944, 949, 963, 964, 977, 982, 985, 988, 989, 996]
positiveSentis_indices length:  101
Positive Sentiment Sentence: Hspd1 may play a role in the development of aortic aneurysms through its ability to interact with the TGFb pathway. Hspd1 may also be regulated by TGFb, which in turn may be regulated by Hspd1. Hspd1 may be a biomarker for TGFb activity in the aorta. Hspd1 may also be a therapeutic target for treatment of aortic aneurysms, as Hspd1 is a chaperone, which in turn can affect TGFb activity and potentially prevent or delay aneurysm forma

In [42]:
#Subset data based on genes indicated by sentiment analysis 
X_sentiment = X_top_1000.to_numpy().T[positiveSentis_indices].T
print(X_sentiment.shape)

list_sentiment_auc = np.zeros((4, 10 ))

for i in range (10):
    X = X_sentiment
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    rf = RandomForestClassifier() 
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)

    y_prob_new = np.empty((y_prob.shape[0], ))
    for j in range(0, y_prob.shape[0]):
        y_prob_new[j] = y_prob[j][1]

    # Accuracy Classification Score 
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob_new)
    classification = classification_report(y_test, y_pred)

#     print("Sentiment Analysis")
#     print("Accuracy:", accuracy)
#     print("Precision:", precision)
#     print("Recall:", recall)
#     print("ROC AUC :", auc_score)
#     print(classification)
    list_sentiment_auc[0][i] = accuracy
    list_sentiment_auc[1][i] = precision
    list_sentiment_auc[2][i] = recall
    list_sentiment_auc[3][i] = auc_score

(25176, 101)


In [41]:
# Compare against randomly selecting 100 genes and using that for training 
rand_indices = random.sample(range(0,1000), 101)
rand_indices.sort()
print(rand_indices)

list_random_auc = np.zeros((4, 10))

for i in range(10):
    X_rand_100 = X_top_1000.to_numpy().T[rand_indices].T
    print(X_rand_100.shape)

    X = X_rand_100
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    rf = RandomForestClassifier() 
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)

    y_prob_new = np.empty((y_prob.shape[0], ))
    for j in range(0, y_prob.shape[0]):
        y_prob_new[j] = y_prob[j][1]

    # Accuracy Classification Score 
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob_new)
    classification = classification_report(y_test, y_pred)

#     print("Sentiment Analysis")
#     print("Accuracy:", accuracy)
#     print("Precision:", precision)
#     print("Recall:", recall)
#     print("ROC AUC :", auc_score)
#     print(classification)
    
    list_random_auc[0][i] = accuracy
    list_random_auc[1][i] = precision
    list_random_auc[2][i] = recall
    list_random_auc[3][i] = auc_score


[3, 12, 13, 32, 47, 49, 70, 73, 86, 89, 93, 100, 101, 103, 106, 113, 131, 137, 139, 140, 158, 166, 170, 172, 173, 177, 178, 182, 185, 202, 208, 212, 225, 226, 232, 250, 255, 267, 282, 307, 315, 321, 345, 352, 356, 383, 391, 408, 432, 435, 437, 440, 445, 453, 459, 461, 466, 472, 473, 477, 499, 516, 518, 541, 563, 572, 579, 581, 586, 587, 592, 605, 612, 639, 669, 671, 688, 710, 712, 714, 730, 737, 762, 776, 779, 797, 801, 808, 824, 855, 862, 873, 911, 918, 923, 946, 965, 986, 991, 992, 993]
(25176, 101)
(25176, 101)
(25176, 101)
(25176, 101)
(25176, 101)
(25176, 101)
(25176, 101)
(25176, 101)
(25176, 101)
(25176, 101)


In [43]:
# calculating significance using a paired t-test 
types = ["accuracy", "precision", "recall", "roc auc"]

for i in range (4): 
    print(types[i])
    print(scipy.stats.ttest_rel(list_random_auc[i], list_sentiment_auc[i]))
    
print(types)
print('random')
print(list_random_auc)
print('sentiment')
print(list_sentiment_auc)

accuracy
TtestResult(statistic=-43.24711821697303, pvalue=9.434679543257187e-12, df=9)
precision
TtestResult(statistic=-34.7263393654471, pvalue=6.725565192245215e-11, df=9)
recall
TtestResult(statistic=-16.743416052360168, pvalue=4.3263428211678377e-08, df=9)
roc auc
TtestResult(statistic=-40.32777067214882, pvalue=1.764524557329655e-11, df=9)
['accuracy', 'precision', 'recall', 'roc auc']
random
[[0.7680699  0.76409849 0.76687847 0.76310564 0.77084988 0.77065131
  0.76211279 0.76528991 0.77680699 0.77442415]
 [0.76466936 0.74893358 0.75702934 0.76080957 0.76186131 0.76346389
  0.75901038 0.75873595 0.76504914 0.77084601]
 [0.85729203 0.87101347 0.86729692 0.85729095 0.87100139 0.8672228
  0.85867312 0.86551127 0.87403509 0.86865569]
 [0.83418186 0.84084089 0.83758248 0.83948849 0.84855469 0.84442045
  0.83408161 0.83704593 0.84823045 0.8405332 ]]
sentiment
[[0.87112788 0.87609214 0.875695   0.8723193  0.86119936 0.8683479
  0.87807784 0.87609214 0.87728356 0.87728356]
 [0.86328125 0.

The analysis here seems to yield fairly good results, with random selection underperforming with respect to selecting the top 101 randomly. 

## Step 6: Normalize using the results from the sentiment analysis 

In [57]:
# Normalize scores so they are either in the 0,1 range 
sentiment_scores = list()
for i in range(0, len(sentiments)):
    val = sentiments[i]['score']
    if (sentiments[i]['label'] == 'NEGATIVE'):
        val = val * -1
    sentiment_scores.append((val /2) + 0.5)
print(sentiment_scores[0:5])

[0.0007914304733276367, 0.0016494691371917725, 0.06643381714820862, 0.19121968746185303, 0.00024753808975219727]


In [65]:
# Normalize data using these sentiment scores 
X_normalized_sentiment = X_top_1000.to_numpy()
for i in range(0, len(sentiment_scores)):
    X_normalized_sentiment[:, i] *= sentiment_scores[i]

In [66]:
X = X_normalized_sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
rf = RandomForestClassifier() 
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)

y_prob_new = np.empty((y_prob.shape[0], ))
for j in range(0, y_prob.shape[0]):
    y_prob_new[j] = y_prob[j][1]

# Accuracy Classification Score 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_prob_new)
classification = classification_report(y_test, y_pred)

print("Sentiment Analysis")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC :", auc_score)
print(classification)

Sentiment Analysis
Accuracy: 0.8562351072279587
Precision: 0.8317757009345794
Recall: 0.9355290819901892
ROC AUC : 0.9354740030715731
              precision    recall  f1-score   support

           0       0.90      0.75      0.82      2182
           1       0.83      0.94      0.88      2854

    accuracy                           0.86      5036
   macro avg       0.87      0.84      0.85      5036
weighted avg       0.86      0.86      0.85      5036

