In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import euclidean
from IPython.core.pylabtools import figsize
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import sklearn.preprocessing as skl_pre
import sklearn.linear_model as skl_lm
import matplotlib.patches as mpatches
import sklearn.neighbors as skl_nb
from sklearn.cluster import KMeans
from operator import itemgetter
import matplotlib.pyplot as plt
from textwrap import wrap
from numpy import argmax
from numpy import array
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn
import pickle

# The Big Bang Theory analysis

In [None]:
# Load the Big Bang Theory frame; later cells read its 'Person', 'Said' and
# 'PCA1'..'PCA300' columns.
# NOTE(review): absolute, machine-specific path, and read_pickle should only be
# used on files you trust.
df = pd.read_pickle('C:/Users/amaca253/Desktop/Friends-Friends-Language-Analysis/BBT/data_300pca.pkl')

## Part1: PCA study

### PCA1 vs PCA2

In [None]:
# Cluster the utterances in the PCA1/PCA2 plane and pick, for every cluster,
# the real utterance closest to its centroid (used as the legend text).
num_clusters = 10
X = df.loc[:, 'PCA1':'PCA2']

kmeans = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    max_iter=100,
    n_init=50,
    random_state=0,  # fixed seed: cluster ids, colours and legend are reproducible
)

label = kmeans.fit_predict(X)
df['Cluster'] = label
df2 = X.values
centroids = kmeans.cluster_centers_

# Row position (into df2) of the cluster member nearest to each centroid.
closest_pt_idx = []
for iclust in range(kmeans.n_clusters):
    cluster_pts_indices = np.where(kmeans.labels_ == iclust)[0]
    dist_to_centre = np.linalg.norm(df2[cluster_pts_indices] - centroids[iclust], axis=1)
    closest_pt_idx.append(cluster_pts_indices[np.argmin(dist_to_centre)])

# [cluster id, centroid x, centroid y], ordered by increasing centroid x.
centroid_list = sorted(
    ([i, centroids[i, 0], centroids[i, 1]] for i in range(len(centroids))),
    key=itemgetter(1),
)

u_labels = np.unique(label)
# closest_pt_idx holds row positions, so the lookup must be positional (.iloc);
# plain df['Said'][k] is label-based and only agrees when the index is 0..n-1.
text = [df['Said'].iloc[idx] for idx in closest_pt_idx]
text2 = ['\n'.join(wrap(l, 40)) for l in text]

# One colour per cluster; the palette is sized from num_clusters so that
# changing the number of clusters cannot leave a cluster without a colour.
colors = sns.color_palette('tab20', num_clusters)
ind_col_map = {x: y for x, y in zip(df['Cluster'].unique(), colors)}
ind_col_map = dict(sorted(ind_col_map.items()))

# Cluster ids in the order their centroids appear along the x axis.
key_order = [entry[0] for entry in centroid_list]


In [None]:
fig, ax = plt.subplots(figsize=(13, 9))

# One translucent point cloud per cluster, then the centroids as small black dots.
for cluster_id in u_labels:
    members = df2[label == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], color=ind_col_map[cluster_id], label=cluster_id, alpha=0.25)
ax.scatter(centroids[:, 0], centroids[:, 1], s=20, color='k')

# Legend entries follow key_order and show each cluster's representative phrase.
legend_list = [mpatches.Patch(color=ind_col_map[key], label=text2[key]) for key in key_order]

ax.set_xlim(-11.5, 13)
ax.set_ylim(-10, 14)

ax.legend(
    title='Average phrase',
    handles=legend_list,
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0,
    fontsize=12,
    title_fontsize=14,
)

# Horizontal axis: description of PCA1 plus a grey arrow marked with '+' and '-'.
x_text = 'PCA1: Short phrase to phrase that include \"Sheldon \"'
ax.set_xlabel(x_text, ha='center', labelpad=35, fontsize=14)
an1 = ax.annotate('+', xy=(0.02, -0.08), xycoords='axes fraction', xytext=(1, -0.08),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an2 = ax.annotate('-', xy=(-0.02, -0.08), xycoords=an1, fontsize=15)

# Vertical axis: description of PCA2 with the same arrow decoration.
y_text = 'PCA2: Long phrase about a female character \n to short phrase about \"Sheldon\" '
ax.set_ylabel(y_text, ha='center', labelpad=35, fontsize=14)
an3 = ax.annotate('+', xy=(-0.06, -0.02), xycoords='axes fraction', xytext=(-0.06, 1),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an4 = ax.annotate('-', xy=(-0.06, -0.04), xycoords=an3, fontsize=15)

#fig.savefig('PCA1_2.png',bbox_inches='tight')
plt.show()

#### Average Position (of PCA1 vs PCA2)

In [None]:
def _mean_position(frame, name):
    """Return (rows, mean PCA1, mean PCA2) for the rows whose 'Person' contains `name`."""
    rows = frame[frame['Person'].str.contains(name)]
    return rows, rows['PCA1'].mean(), rows['PCA2'].mean()


# Mean PCA1/PCA2 coordinates of each main character.
sheldon, sheldon_mean_pca1_index, sheldon_mean_pca2_index = _mean_position(df, 'Sheldon')
leonard, leonard_mean_pca1_index, leonard_mean_pca2_index = _mean_position(df, 'Leonard')
penny, penny_mean_pca1_index, penny_mean_pca2_index = _mean_position(df, 'Penny')
howard, howard_mean_pca1_index, howard_mean_pca2_index = _mean_position(df, 'Howard')
raj, raj_mean_pca1_index, raj_mean_pca2_index = _mean_position(df, 'Raj')
bernadette, bernadette_mean_pca1_index, bernadette_mean_pca2_index = _mean_position(df, 'Bernadette')
amy, amy_mean_pca1_index, amy_mean_pca2_index = _mean_position(df, 'Amy')

# closest_pt_idx holds row positions, so the utterance lookup must be positional.
text = [df['Said'].iloc[idx] for idx in closest_pt_idx]
text2 = ['\n'.join(wrap(l, 40)) for l in text]

# Palette sized from num_clusters rather than a hard-coded 10.
colors = sns.color_palette('tab20', num_clusters)
ind_col_map = {x: y for x, y in zip(df['Cluster'].unique(), colors)}
ind_col_map = dict(sorted(ind_col_map.items()))

key_order = [entry[0] for entry in centroid_list]

df2 = X.values

fig, ax = plt.subplots(figsize=(14, 10))

# Same clustered background as the full scatter plot, zoomed in further below.
for cluster_id in u_labels:
    members = df2[label == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], color=ind_col_map[cluster_id], label=cluster_id, alpha=0.25)

# Legend handles are built but the legend itself is intentionally not drawn here.
legend_list = [mpatches.Patch(color=ind_col_map[key], label=text2[key]) for key in key_order]

#ax.legend(title='Average phrase',bbox_to_anchor=(1.02, 1),handles=legend_list, loc='upper left',borderaxespad=0, fontsize=11, title_fontsize=13)

# (label, mean PCA1, mean PCA2, text offset in points); offsets keep labels from overlapping.
character_marks = [
    ('Sheldon', sheldon_mean_pca1_index, sheldon_mean_pca2_index, (10, -10)),
    ('Leonard', leonard_mean_pca1_index, leonard_mean_pca2_index, (10, 10)),
    ('Penny', penny_mean_pca1_index, penny_mean_pca2_index, (10, 10)),
    ('Howard', howard_mean_pca1_index, howard_mean_pca2_index, (-70, 2)),
    ('Raj', raj_mean_pca1_index, raj_mean_pca2_index, (10, 9)),
    ('Bernadette', bernadette_mean_pca1_index, bernadette_mean_pca2_index, (10, 11)),
    ('Amy', amy_mean_pca1_index, amy_mean_pca2_index, (10, 10)),
]
for character, mean_x, mean_y, offset in character_marks:
    ax.annotate(character, (mean_x, mean_y), xytext=offset, textcoords='offset points',
                arrowprops=dict(arrowstyle="->"), fontsize=16)

ax.set_xlim(-1, 2)
ax.set_ylim(-0.6, 0.75)

ax.set_title('Average position for each character')

# x_text / y_text are the axis descriptions defined by the PCA1 vs PCA2 plotting cell.
ax.set_xlabel(x_text, ha='center', labelpad=20, fontsize=14)
ax.set_ylabel(y_text, ha='center', labelpad=20, fontsize=14)

fig.savefig('average position.png', bbox_inches='tight')

plt.show()

### PCA3 vs PCA4

In [None]:
# Cluster the utterances in the PCA3/PCA4 plane and pick, for every cluster,
# the real utterance closest to its centroid (used as the legend text).
num_clusters = 10
X = df.loc[:, 'PCA3':'PCA4']

kmeans = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    max_iter=100,
    n_init=50,
    random_state=0,  # fixed seed: cluster ids, colours and legend are reproducible
)

label = kmeans.fit_predict(X)
df['Cluster'] = label
df2 = X.values
centroids = kmeans.cluster_centers_

# Row position (into df2) of the cluster member nearest to each centroid.
closest_pt_idx = []
for iclust in range(kmeans.n_clusters):
    cluster_pts_indices = np.where(kmeans.labels_ == iclust)[0]
    dist_to_centre = np.linalg.norm(df2[cluster_pts_indices] - centroids[iclust], axis=1)
    closest_pt_idx.append(cluster_pts_indices[np.argmin(dist_to_centre)])

# [cluster id, centroid x, centroid y], ordered by increasing centroid x.
centroid_list = sorted(
    ([i, centroids[i, 0], centroids[i, 1]] for i in range(len(centroids))),
    key=itemgetter(1),
)

u_labels = np.unique(label)
# closest_pt_idx holds row positions, so the lookup must be positional (.iloc);
# plain df['Said'][k] is label-based and only agrees when the index is 0..n-1.
text = [df['Said'].iloc[idx] for idx in closest_pt_idx]
text2 = ['\n'.join(wrap(l, 40)) for l in text]

# One colour per cluster; palette sized from num_clusters, not a literal 10.
colors = sns.color_palette('tab20', num_clusters)
ind_col_map = {x: y for x, y in zip(df['Cluster'].unique(), colors)}
ind_col_map = dict(sorted(ind_col_map.items()))

# Cluster ids in the order their centroids appear along the x axis.
key_order = [entry[0] for entry in centroid_list]

In [None]:
fig, ax = plt.subplots(figsize=(13, 9))

# Cluster point clouds first, centroids on top.
for cluster_id in u_labels:
    members = df2[label == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], color=ind_col_map[cluster_id], label=cluster_id, alpha=0.25)
ax.scatter(centroids[:, 0], centroids[:, 1], s=20, color='k')

# Legend entries follow key_order and show each cluster's representative phrase.
legend_list = [mpatches.Patch(color=ind_col_map[key], label=text2[key]) for key in key_order]

ax.set_xlim(-11.5, 13)
ax.set_ylim(-10, 14)

ax.legend(
    title='Average phrase',
    handles=legend_list,
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0,
    fontsize=12,
    title_fontsize=14,
)

# Horizontal axis description with its '+' / '-' arrow.
x_text = 'PCA3: Phrase that question a premise to phrase with a first person future action'
ax.set_xlabel(x_text, ha='center', labelpad=35, fontsize=14)
an1 = ax.annotate('+', xy=(0.02, -0.08), xycoords='axes fraction', xytext=(1, -0.08),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an2 = ax.annotate('-', xy=(-0.02, -0.08), xycoords=an1, fontsize=15)

# Vertical axis description with its '+' / '-' arrow.
y_text = 'PCA4: Phrase about relationship to phrase related to food'
ax.set_ylabel(y_text, ha='center', labelpad=35, fontsize=14)
an3 = ax.annotate('+', xy=(-0.06, -0.02), xycoords='axes fraction', xytext=(-0.06, 1),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an4 = ax.annotate('-', xy=(-0.06, -0.04), xycoords=an3, fontsize=15)

fig.savefig('PCA3_4.png', bbox_inches='tight')
plt.show()


### PCA5 vs PCA6

In [None]:
# Cluster the utterances in the PCA5/PCA6 plane and pick, for every cluster,
# the real utterance closest to its centroid (used as the legend text).
num_clusters = 10
X = df.loc[:, 'PCA5':'PCA6']

kmeans = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    max_iter=100,
    n_init=50,
    random_state=0,  # fixed seed: cluster ids, colours and legend are reproducible
)

label = kmeans.fit_predict(X)
df['Cluster'] = label
df2 = X.values
centroids = kmeans.cluster_centers_

# Row position (into df2) of the cluster member nearest to each centroid.
closest_pt_idx = []
for iclust in range(kmeans.n_clusters):
    cluster_pts_indices = np.where(kmeans.labels_ == iclust)[0]
    dist_to_centre = np.linalg.norm(df2[cluster_pts_indices] - centroids[iclust], axis=1)
    closest_pt_idx.append(cluster_pts_indices[np.argmin(dist_to_centre)])

# [cluster id, centroid x, centroid y], ordered by increasing centroid x.
centroid_list = sorted(
    ([i, centroids[i, 0], centroids[i, 1]] for i in range(len(centroids))),
    key=itemgetter(1),
)

u_labels = np.unique(label)
# closest_pt_idx holds row positions, so the lookup must be positional (.iloc);
# plain df['Said'][k] is label-based and only agrees when the index is 0..n-1.
text = [df['Said'].iloc[idx] for idx in closest_pt_idx]
text2 = ['\n'.join(wrap(l, 40)) for l in text]

# One colour per cluster; palette sized from num_clusters, not a literal 10.
colors = sns.color_palette('tab20', num_clusters)
ind_col_map = {x: y for x, y in zip(df['Cluster'].unique(), colors)}
ind_col_map = dict(sorted(ind_col_map.items()))

# Cluster ids in the order their centroids appear along the x axis.
key_order = [entry[0] for entry in centroid_list]


In [None]:
fig, ax = plt.subplots(figsize=(13, 9))

# Cluster point clouds first, centroids on top.
for cluster_id in u_labels:
    members = df2[label == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], color=ind_col_map[cluster_id], label=cluster_id, alpha=0.25)
ax.scatter(centroids[:, 0], centroids[:, 1], s=20, color='k')

# Legend entries follow key_order and show each cluster's representative phrase.
legend_list = [mpatches.Patch(color=ind_col_map[key], label=text2[key]) for key in key_order]

ax.set_xlim(-11.5, 13)
ax.set_ylim(-10, 14)

ax.legend(
    title='Average phrase',
    handles=legend_list,
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0,
    fontsize=12,
    title_fontsize=14,
)

# Horizontal axis description with its '+' / '-' arrow.
x_text = 'PCA5: Discussion with often a negation to short question about a woman'
ax.set_xlabel(x_text, ha='center', labelpad=35, fontsize=14)
an1 = ax.annotate('+', xy=(0.02, -0.08), xycoords='axes fraction', xytext=(1, -0.08),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an2 = ax.annotate('-', xy=(-0.02, -0.08), xycoords=an1, fontsize=15)

# Vertical axis description with its '+' / '-' arrow.
y_text = 'PCA6: Phrase with an apology to phrase with affirmative statement '
ax.set_ylabel(y_text, ha='center', labelpad=35, fontsize=14)
an3 = ax.annotate('+', xy=(-0.06, -0.02), xycoords='axes fraction', xytext=(-0.06, 1),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an4 = ax.annotate('-', xy=(-0.06, -0.04), xycoords=an3, fontsize=15)

fig.savefig('PCA5_6.png', bbox_inches='tight')
plt.show()

## Part2: Logistic regression

### ROC AUC score and accuracy

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def logistic_function(dataset, number_pca):
    """Fit a logistic regression on PCA1..PCA<number_pca> and return the test ROC AUC.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the 'PCA1'..'PCA<number_pca>' columns.
    number_pca : int
        Index of the last PCA column to include as a feature.

    Returns
    -------
    float
        ROC AUC on the held-out third of the rows.

    Notes
    -----
    The class labels are read from the module-level ``integer_encoded`` array,
    which must line up row for row with ``dataset``.  Earlier versions ignored
    both parameters and silently used the globals ``df`` and ``i`` instead.
    """
    last_component = 'PCA' + str(number_pca)
    X = scaled(dataset.loc[:, 'PCA1':last_component])
    y = np.squeeze(integer_encoded)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    # AUC is computed from the signed distance to the decision boundary.
    return roc_auc_score(y_test, model.decision_function(X_test), multi_class='ovr')


def logistic_function_accuracy(dataset, number_pca):
    """Fit a logistic regression on PCA1..PCA<number_pca> and return the test accuracy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the 'PCA1'..'PCA<number_pca>' columns.
    number_pca : int
        Index of the last PCA column to include as a feature.

    Returns
    -------
    float
        Fraction of correctly classified rows in the held-out third.

    Notes
    -----
    The class labels are read from the module-level ``integer_encoded`` array,
    which must line up row for row with ``dataset``.  Earlier versions ignored
    both parameters and silently used the globals ``df`` and ``i`` instead.
    """
    last_component = 'PCA' + str(number_pca)
    X = scaled(dataset.loc[:, 'PCA1':last_component])
    y = np.squeeze(integer_encoded)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Conventional argument order: ground truth first, predictions second.
    return accuracy_score(y_test, y_pred)

def scaled(X):
    """Shift every column to start at zero and divide it by its sample standard deviation.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature columns.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``X`` holding ``(X - column_min) / column_std``.
        A constant column (zero standard deviation) yields NaN/inf entries.

    Notes
    -----
    The previous implementation overwrote the column minima with per-row minima
    and then indexed them by column number, so each column was shifted by an
    unrelated row's minimum (and an IndexError was raised when there were more
    columns than rows).  It also wrote into ``X.values`` element by element,
    which could modify the caller's frame.  This version is vectorised and
    leaves ``X`` untouched.
    """
    column_min = X.min(axis=0)
    column_std = X.std(axis=0)
    return ((X - column_min) / column_std).values

#### Dataset creation

In [None]:
# Restrict the data to the two speakers being compared.
# NOTE(review): this rebinds `df`, so every later cell sees only these two speakers.
speaker = df['Person']
Sheldon = df[speaker == 'Sheldon']
Penny = df[speaker == 'Penny']
# For the Sheldon vs Leonard comparison, select 'Leonard' instead of 'Penny'
# and concatenate that frame with Sheldon.
df = pd.concat([Sheldon, Penny], ignore_index=True)

# Number of rows per remaining speaker.
print(df.pivot_table(index=['Person'], aggfunc='size'))

In [None]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Turn the speaker names into integer class labels, plus a one-hot version.
data = df['Person']
values = array(data)
print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print('integer_encoded: ',integer_encoded)
# binary encode
# The `sparse=False` keyword was renamed to `sparse_output` in newer scikit-learn
# releases and the old spelling now raises a TypeError.  Taking the default
# (sparse) output and densifying it works with both old and new versions.
onehot_encoder = OneHotEncoder()
# Column vector of shape (n, 1); the logistic-regression helpers squeeze it back to 1-D.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)


#### ROC AUC score

In [None]:
# ROC AUC of the speaker classifier as more leading PCA components are used
# (PCA1..PCA2 up to PCA1..PCA300).
roc = []
for i in range(2,301,1):
    print('process until PCA',i)
    roc.append(logistic_function(df, i))
    print('done')

# `df` holds Sheldon and Penny at this point and the following cell loads
# 'array_Sheldon_Penny.npy', so save under that name (the previous
# 'array_Ross_Phoebe.npy' was a leftover from the Friends analysis).
np.save('array_Sheldon_Penny.npy', roc)

In [None]:
# Reload the saved AUC curves for the two comparisons (Sheldon/Penny, Sheldon/Leonard).
array_SP = np.load('array_Sheldon_Penny.npy')
array_SL = np.load('array_Sheldon_Leonard.npy')


In [None]:
# AUC as a function of the number of leading PCA components (2..300).
x = np.arange(2, 301)
y1, y2 = np.asarray(array_SP), np.asarray(array_SL)

fig, ax = plt.subplots()
ax.set_ylim(0.51, 0.8)
line1 = ax.plot(x, y1, '#9467bd', label='Sheldon vs Penny')[0]
line2 = ax.plot(x, y2, '#17becf', label='Sheldon vs Leonard')[0]
ax.legend(handles=[line1, line2], loc='lower right')
ax.set_xlabel('Number of PCA')
ax.set_ylabel('AUC')
fig.savefig('TBBT_auc.png')
plt.show()

#### Accuracy

In [None]:
# Test accuracy of the speaker classifier as more leading PCA components are
# used (PCA1..PCA2 up to PCA1..PCA300).
acc = []
# The loop variable must stay named `i`: logistic_function_accuracy reads the global `i`.
for i in range(2,301,1):
    print('process until PCA',i)
    acc.append(logistic_function_accuracy(df, i))
    print('done')
    

In [None]:
# Persist the accuracy curve so the plotting cells can reload it.
np.save('accuracy_Sheldon_Penny.npy', acc)

In [None]:
# Reload the saved accuracy curves for the two comparisons (Sheldon/Penny, Sheldon/Leonard).
accuracy_SP = np.load('accuracy_Sheldon_Penny.npy')
accuracy_SL = np.load('accuracy_Sheldon_Leonard.npy')


In [None]:
# Accuracy as a function of the number of leading PCA components (2..300).
x = np.arange(2, 301)
y1, y2 = np.asarray(accuracy_SP), np.asarray(accuracy_SL)

fig, ax = plt.subplots()
ax.set_ylim(0.51, 0.8)
line1 = ax.plot(x, y1, '#9467bd', label='Sheldon vs Penny')[0]
line2 = ax.plot(x, y2, '#17becf', label='Sheldon vs Leonard')[0]
ax.legend(handles=[line1, line2], loc='lower right')
ax.set_xlabel('Number of PCA')
ax.set_ylabel('Accuracy')
fig.savefig('TBBT_accuracy.png')
plt.show()

## Part3: Heatmap and graph

In [None]:
import itertools
# Build every unordered pair of main characters.
personnes = ['Amy','Penny','Bernadette','Sheldon', 'Leonard', 'Raj', 'Howard']
couples = list(itertools.combinations(personnes, 2))

# One pair per output line.
print(*couples, sep='\n')

In [None]:
def create_dataset(dataset, perso1, perso2):
    """Return the rows spoken by `perso1` followed by those spoken by `perso2`.

    Rows are matched on exact equality with the 'Person' column; the result
    gets a fresh 0..n-1 index.
    """
    speakers = dataset['Person']
    per_speaker = [dataset.loc[speakers == who] for who in (perso1, perso2)]
    return pd.concat(per_speaker, ignore_index=True)


# Fit one logistic regression per character pair on all 300 standardised PCA
# components; keep the coefficients, the intercept and the test ROC AUC.
# NOTE(review): the "Dataset creation" cell rebinds `df` to the Sheldon/Penny
# rows only, so `df` must be reloaded before this cell for the other pairs to
# contain any data.
roc = []
coef_matrix = np.zeros((len(couples), 300))
model_intercept = np.zeros(len(couples))
for i, couple in enumerate(couples):
    first, second = str(couple[0]), str(couple[1])
    print(first, second)
    df_t = create_dataset(df, first, second)

    data = df_t['Person']
    values = array(data)
    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    y = np.squeeze(integer_encoded)

    # Standardise every component (zero mean, unit sample std) in one
    # vectorised step instead of a Python loop over every cell; this also
    # avoids positional [] indexing on label-indexed Series.
    components = df_t.loc[:, 'PCA1':'PCA300']
    X = ((components - components.mean(axis=0)) / components.std(axis=0)).values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

    model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

    # Binary problem: coef_ has a single row and intercept_ a single entry.
    coef_matrix[i] = model.coef_.ravel()
    model_intercept[i] = model.intercept_[0]

    # AUC
    roc_score = roc_auc_score(y_test, model.decision_function(X_test), multi_class='ovr')
    roc.append([first, second, roc_score])

In [None]:
# Display the pairs ordered by their ROC AUC (third field of each entry), lowest first.
sorted(roc, key=itemgetter(2))

In [None]:
# Human-readable label for each pair, in the same order as the rows of coef_matrix.
data = [str(couple[0]) + ' and ' + str(couple[1]) for couple in couples]

# One row per pair, one column per PCA coefficient, plus the pair label.
column_names = ['PCA' + str(component) for component in range(1, 301)]
df_matrix = pd.DataFrame(coef_matrix, columns=column_names)
df_matrix['Pairs'] = data
#df_matrix.to_pickle('C:/Users/amaca253/Desktop/Friends-Friends-Language-Analysis/BBT/coef_matrix_TBBT_solverlbfgs_unbalance.pkl')

In [None]:
# Reload a previously saved coefficient matrix, replacing the one built above.
# NOTE(review): absolute, machine-specific path, and read_pickle should only be
# used on files you trust.
df_matrix = pd.read_pickle('C:/Users/amaca253/Desktop/Friends-Friends-Language-Analysis/BBT/coef_matrix_TBBT_solverlbfgs_unbalance.pkl')

In [None]:
 '''For each row (character pair) we rank the coefficients by absolute value,
 store the ten largest (with their sign) in a matrix, and keep the names of
 the corresponding PCA columns in a list.'''
# For every pair keep the ten coefficients with the largest absolute value
# (stored signed in `data`) and the PCA column names they came from (`text`).
df_subset = df_matrix.loc[:, 'PCA1':'PCA300']
data = np.zeros((len(df_subset),10))

text=[]
for row in range(len(df_subset)):
    # Rows are addressed by position throughout; the original mixed .iloc[row]
    # with .loc[row], which only agree when the index is exactly 0..n-1.
    signed_row = df_subset.iloc[row]
    top_10_columns = signed_row.abs().nlargest(10)
    data[row] = signed_row[top_10_columns.index]
    text.append(top_10_columns.index)


column_names = [f'Column{i}' for i in range(1, 11)]
df2 = pd.DataFrame(data, columns=column_names)
# Positional assignment, consistent with the positional row access above.
df2['Pairs'] = df_matrix['Pairs'].values


# Order the pairs by the value in their first (largest-magnitude) column.
df2 = df2.sort_values('Column1',ignore_index=True)

data2 = df2.loc[:, 'Column1':'Column10']

In [None]:
# Pair every stored coefficient with the name of the PCA column it came from,
# so the label stays attached to the value when the rows are reordered.
n_rows, n_cols = len(data), len(data[0, :])
data_list = [
    [[data[r, c], text[r][c]] for c in range(n_cols)]
    for r in range(n_rows)
]

# Order the rows by their first [value, label] cell.
sort_list = sorted(data_list, key=itemgetter(0))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 6))
cmap = sns.diverging_palette(220, 20, as_cmap=True)
# Symmetric colour range based on the first column of the table.
maximum = data2['Column1'].abs().max()
heatmap = sns.heatmap(
    data2,
    ax=ax,
    cmap=cmap,
    vmin=-maximum,
    vmax=maximum,
    linewidths=.5,
    xticklabels=[],
    yticklabels=df2['Pairs'],
)

# Annotate each cell with the corresponding PCA label
cell_font = dict(ha='center', va='center', fontsize=8, color='white', fontweight='bold')
for row_pos in range(len(df_subset)):
    ranked_cells = sort_list[row_pos]
    for col_pos in range(10):
        cell_label = ranked_cells[col_pos][1]
        ax.text(col_pos + 0.5, row_pos + 0.5, cell_label, **cell_font)
#plt.savefig('heatmap_friends_abs_top10.png',bbox_inches='tight')

plt.show()

In [None]:
def split_names(names):
    """Split an 'A and B' label on ' and ' and return the trimmed parts as a list."""
    return [part.strip() for part in names.split(" and ")]


# Rewrite each 'A and B' row label as 'P(A|A or B)'.
# NOTE(review): the cell overwrites df2['Pairs'], so running it a second time
# fails because the labels no longer contain ' and '.
pair = []
for raw_label in df2['Pairs']:
    couple = split_names(raw_label)
    first, second = str(couple[0]), str(couple[1])
    temp = 'P(' + first + '|' + first + ' or ' + second + ')'
    pair.append(temp)
    print(temp)

df2['Pairs'] = pair

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
cmap = sns.diverging_palette(220, 20, as_cmap=True)
# Symmetric colour range based on the first column of the table.
maximum = data2['Column1'].abs().max()
heatmap = sns.heatmap(
    data2,
    ax=ax,
    cmap=cmap,
    vmin=-maximum,
    vmax=maximum,
    linewidths=.5,
    xticklabels=[],
    yticklabels=df2['Pairs'],
)

# Annotate each cell with the corresponding PCA label
cell_font = dict(ha='center', va='center', fontsize=8, color='white', fontweight='bold')
for row_pos in range(len(df_subset)):
    ranked_cells = sort_list[row_pos]
    for col_pos in range(10):
        cell_label = ranked_cells[col_pos][1]
        ax.text(col_pos + 0.5, row_pos + 0.5, cell_label, **cell_font)
# NOTE(review): the file name says "Friends" although this cell sits in the
# Big Bang Theory section - confirm the intended name.
fig.savefig('heatmap_Friends_abs_top10_2.png', bbox_inches='tight')

plt.show()

In [None]:
def count_occurrences(text_list):
    """Return a dict mapping each item of `text_list` to how many times it appears.

    Keys are inserted in order of first appearance.
    """
    tally = {}
    for entry in text_list:
        tally[entry] = tally.get(entry, 0) + 1
    return tally

# Every PCA label shown in the heatmap, row by row.
list_pca = [sort_list[i][j][1] for i in range(len(df_subset)) for j in range(10)]

# How often each component appears among the top-10 lists, most frequent first.
result = count_occurrences(list_pca)
result = dict(sorted(result.items(), key=lambda item: item[1],reverse=True))

for item, count in result.items():
    print(f"{item}: {count} occurrence(s)")


def _print_dialogue(rows):
    """Print the 'Person' / 'Said' columns of `rows` as a LaTeX dialogue environment."""
    # Raw strings: '\b' in a normal literal is a backspace, which is why the
    # old code printed the invalid '\ begin{dialogue}'.
    print(r'\begin{dialogue}')
    for person, said in zip(rows['Person'], rows['Said']):
        print(rf'\speak{{{person}}} {said}')
    print(r'\end{dialogue}')
    print()


# One subsection per distinct component (iterating list_pca repeated the same
# subsection once for every heatmap cell that mentions the component).
for pca in result:
    print(rf'\subsection{{{pca}}}')

    print('top values')
    _print_dialogue(df.nlargest(20, pca))

    print('lowest values')
    _print_dialogue(df.nsmallest(20, pca))
    print()

In [None]:
import networkx as nx

def split_names(names):
    """Split an 'A and B' label on ' and ' and return the trimmed parts as a list."""
    pieces = names.split(" and ")
    return list(map(str.strip, pieces))



G = nx.DiGraph()

# Add nodes
node = personnes
G.add_nodes_from(node)

# One weighted, directed edge per pair whose coefficient on the chosen
# component exceeds 0.1 in absolute value.
edges = []
for pair_label, coeff in zip(df_matrix['Pairs'], df_matrix['PCA19']):  # 9 for friends and 19 for TBBT
    print(coeff)
    if np.abs(coeff) <= 0.1:
        # The original wrote `i += 1` here, which has no effect inside a
        # `for` loop; skipping the pair is what actually happened.
        continue
    couple = split_names(pair_label)
    edges.append((couple[0], couple[1], coeff))


G.add_weighted_edges_from(edges)

# Create lists of edges and labels for positive and negative weights
positive_edges = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] >= 0]
negative_edges = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] < 0]
# Labels show the magnitude only; the sign is encoded by the arrow direction.
edge_labels = {(u, v): f"{abs(w):.2f}" for (u, v, w) in G.edges(data='weight')}

# Draw the graph
fig, ax = plt.subplots(figsize=(10, 8))  # Adjust the size of the plot
pos = nx.spring_layout(G, k=3, seed=0)  # fixed seed so the layout is reproducible

# Positive weights: arrow from the first to the second character.
nx.draw(G, pos, ax=ax, with_labels=True, edgelist=positive_edges, node_size=2500, node_color="skyblue",
        font_size=9, arrows=True, arrowstyle= '-|>')

# Negative weights: same edge, arrow head reversed.
nx.draw(G, pos, ax=ax, with_labels=True, edgelist=negative_edges,node_size=2500, node_color="skyblue",
        font_size=9, arrows=True, arrowstyle= '<|-')


nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, ax=ax)
#plt.savefig("graphTBBT.png")

# Friends analysis

In [None]:
# Load the Friends frame; this rebinds `df`, so the cells below work on Friends data.
# NOTE(review): absolute, machine-specific path, and read_pickle should only be
# used on files you trust.
df = pd.read_pickle('C:/Users/amaca253/Desktop/Friends-Friends-Language-Analysis/BBT/data_300pca_friends2.pkl')

## Part1: PCA study

### PCA1 vs PCA2

In [None]:
# Cluster the utterances in the PCA1/PCA2 plane and find, for every cluster,
# the row closest to its centroid.
num_clusters = 10
X = df.loc[:, 'PCA1':'PCA2']

kmeans = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    max_iter=100,
    n_init=50,
    random_state=0,  # fixed seed: cluster ids, colours and legend are reproducible
)

label = kmeans.fit_predict(X)
df['Cluster'] = label
df2 = X.values

# Row position (into df2) of the cluster member nearest to each centroid.
closest_pt_idx = []
for iclust in range(kmeans.n_clusters):
    cluster_pts_indices = np.where(kmeans.labels_ == iclust)[0]
    cluster_cen = kmeans.cluster_centers_[iclust]
    dist_to_centre = np.linalg.norm(df2[cluster_pts_indices] - cluster_cen, axis=1)
    closest_pt_idx.append(cluster_pts_indices[np.argmin(dist_to_centre)])

In [None]:
centroids = kmeans.cluster_centers_

# [cluster id, centroid x, centroid y], ordered by increasing centroid x.
centroid_list = sorted(
    ([i, centroids[i, 0], centroids[i, 1]] for i in range(len(centroids))),
    key=itemgetter(1),
)

u_labels = np.unique(label)
# closest_pt_idx holds row positions, so the lookup must be positional (.iloc);
# plain df['Said'][k] is label-based and only agrees when the index is 0..n-1.
text = [df['Said'].iloc[idx] for idx in closest_pt_idx]
text2 = ['\n'.join(wrap(l, 40)) for l in text]

# One colour per cluster; palette sized from num_clusters, not a literal 10.
colors = sns.color_palette('tab20', num_clusters)
ind_col_map = {x: y for x, y in zip(df['Cluster'].unique(), colors)}
ind_col_map = dict(sorted(ind_col_map.items()))

# Cluster ids in the order their centroids appear along the x axis.
key_order = [entry[0] for entry in centroid_list]


fig, ax = plt.subplots(figsize=(13, 9))

# Cluster point clouds first, centroids on top.
for cluster_id in u_labels:
    members = df2[label == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], color=ind_col_map[cluster_id], label=cluster_id, alpha=0.25)
ax.scatter(centroids[:, 0], centroids[:, 1], s=20, color='k')

# Legend entries follow key_order and show each cluster's representative phrase.
legend_list = [mpatches.Patch(color=ind_col_map[key], label=text2[key]) for key in key_order]

ax.set_xlim(-10, 13)
ax.set_ylim(-10, 14)

ax.legend(
    title='Average phrase',
    handles=legend_list,
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0,
    fontsize=12,
    title_fontsize=14,
)

# Horizontal axis description with its '+' / '-' arrow.
x_text = 'PCA1: From phrase that include a name to \" Hey \" '
ax.set_xlabel(x_text, ha='center', labelpad=35, fontsize=14)
an1 = ax.annotate('+', xy=(0.02, -0.08), xycoords='axes fraction', xytext=(1, -0.08),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an2 = ax.annotate('-', xy=(-0.02, -0.08), xycoords=an1, fontsize=15)

# Vertical axis description with its '+' / '-' arrow.
y_text = 'PCA2: From phrase that include "yeah\" to \" Hi \"'
ax.set_ylabel(y_text, ha='center', labelpad=30, fontsize=14)
an3 = ax.annotate('+', xy=(-0.06, -0.02), xycoords='axes fraction', xytext=(-0.06, 1),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an4 = ax.annotate('-', xy=(-0.06, -0.04), xycoords=an3, fontsize=15)

fig.savefig('PCA1_2_FA.png', bbox_inches='tight')

plt.show()

#### Average position (PCA1 vs PCA2)

In [None]:
def _mean_position(frame, name):
    """Return (rows, mean PCA1, mean PCA2) for the rows whose 'Person' contains `name`."""
    rows = frame[frame['Person'].str.contains(name)]
    return rows, rows['PCA1'].mean(), rows['PCA2'].mean()


# Mean PCA1/PCA2 coordinates of each main character.
chandler, chandler_mean_pca1_index, chandler_mean_pca2_index = _mean_position(df, 'Chandler')
joey, joey_mean_pca1_index, joey_mean_pca2_index = _mean_position(df, 'Joey')
monica, monica_mean_pca1_index, monica_mean_pca2_index = _mean_position(df, 'Monica')
ross, ross_mean_pca1_index, ross_mean_pca2_index = _mean_position(df, 'Ross')
rachel, rachel_mean_pca1_index, rachel_mean_pca2_index = _mean_position(df, 'Rachel')
phoebe, phoebe_mean_pca1_index, phoebe_mean_pca2_index = _mean_position(df, 'Phoebe')

# closest_pt_idx holds row positions, so the utterance lookup must be positional.
text = [df['Said'].iloc[idx] for idx in closest_pt_idx]
text2 = ['\n'.join(wrap(l, 40)) for l in text]

# Palette sized from num_clusters rather than a hard-coded 10.
colors = sns.color_palette('tab20', num_clusters)
ind_col_map = {x: y for x, y in zip(df['Cluster'].unique(), colors)}
ind_col_map = dict(sorted(ind_col_map.items()))

key_order = [entry[0] for entry in centroid_list]

df2 = X.values

fig, ax = plt.subplots(figsize=(14, 10))

# Same clustered background as the full scatter plot, zoomed in further below.
for cluster_id in u_labels:
    members = df2[label == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], color=ind_col_map[cluster_id], label=cluster_id, alpha=0.25)

# Legend handles are built but the legend itself is intentionally not drawn here.
legend_list = [mpatches.Patch(color=ind_col_map[key], label=text2[key]) for key in key_order]

#ax.legend(title='Average phrase',bbox_to_anchor=(1.02, 1),handles=legend_list, loc='upper left',borderaxespad=0, fontsize=11, title_fontsize=13)

# (label, mean PCA1, mean PCA2, text offset in points); offsets keep labels from overlapping.
character_marks = [
    ('Chandler', chandler_mean_pca1_index, chandler_mean_pca2_index, (10, 10)),
    ('Joey', joey_mean_pca1_index, joey_mean_pca2_index, (10, 10)),
    ('Monica', monica_mean_pca1_index, monica_mean_pca2_index, (10, 10)),
    ('Ross', ross_mean_pca1_index, ross_mean_pca2_index, (10, 2)),
    ('Rachel', rachel_mean_pca1_index, rachel_mean_pca2_index, (10, -2)),
    ('Phoebe', phoebe_mean_pca1_index, phoebe_mean_pca2_index, (10, 11)),
]
for character, mean_x, mean_y, offset in character_marks:
    ax.annotate(character, (mean_x, mean_y), xytext=offset, textcoords='offset points',
                arrowprops=dict(arrowstyle="->"), fontsize=16)

ax.set_xlim(-1, 2)
ax.set_ylim(-0.6, 0.75)
ax.set_title('Average position for each character')

x_text = 'PCA1: From phrase that include a name to \" Hey \" '
ax.set_xlabel(x_text, ha='center', labelpad=20, fontsize=14)

y_text = 'PCA2: From phrase that include "yeah\" to \" Hi \"'
ax.set_ylabel(y_text, ha='center', labelpad=20, fontsize=14)

fig.savefig('average position FA.png', bbox_inches='tight')

plt.show()

### PCA3 vs PCA4

In [None]:
# Cluster the utterances in the PCA3/PCA4 plane and pick, for every cluster,
# the real utterance closest to its centroid (used as the legend text).
num_clusters = 10
X = df.loc[:, 'PCA3':'PCA4']

kmeans = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    max_iter=100,
    n_init=50,
    random_state=0,  # fixed seed: cluster ids, colours and legend are reproducible
)

label = kmeans.fit_predict(X)
df['Cluster'] = label
df2 = X.values
centroids = kmeans.cluster_centers_

# Row position (into df2) of the cluster member nearest to each centroid.
closest_pt_idx = []
for iclust in range(kmeans.n_clusters):
    cluster_pts_indices = np.where(kmeans.labels_ == iclust)[0]
    dist_to_centre = np.linalg.norm(df2[cluster_pts_indices] - centroids[iclust], axis=1)
    closest_pt_idx.append(cluster_pts_indices[np.argmin(dist_to_centre)])

# [cluster id, centroid x, centroid y], ordered by increasing centroid x.
centroid_list = sorted(
    ([i, centroids[i, 0], centroids[i, 1]] for i in range(len(centroids))),
    key=itemgetter(1),
)

u_labels = np.unique(label)
# closest_pt_idx holds row positions, so the lookup must be positional (.iloc);
# plain df['Said'][k] is label-based and only agrees when the index is 0..n-1.
text = [df['Said'].iloc[idx] for idx in closest_pt_idx]
text2 = ['\n'.join(wrap(l, 40)) for l in text]

# One colour per cluster; palette sized from num_clusters, not a literal 10.
colors = sns.color_palette('tab20', num_clusters)
ind_col_map = {x: y for x, y in zip(df['Cluster'].unique(), colors)}
ind_col_map = dict(sorted(ind_col_map.items()))

# Cluster ids in the order their centroids appear along the x axis.
# (The original line was missing its closing bracket, a SyntaxError.)
key_order = [entry[0] for entry in centroid_list]

In [None]:
fig, ax = plt.subplots(figsize=(13, 9))

# Cluster point clouds first, centroids on top.
for cluster_id in u_labels:
    members = df2[label == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], color=ind_col_map[cluster_id], label=cluster_id, alpha=0.25)
ax.scatter(centroids[:, 0], centroids[:, 1], s=20, color='k')

# Legend entries follow key_order and show each cluster's representative phrase.
legend_list = [mpatches.Patch(color=ind_col_map[key], label=text2[key]) for key in key_order]

ax.set_xlim(-10, 13)
ax.set_ylim(-10, 14)

ax.legend(
    title='Average phrase',
    handles=legend_list,
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0,
    fontsize=12,
    title_fontsize=14,
)

# Horizontal axis description with its '+' / '-' arrow.
x_text = 'PCA3: From phrase that include a name \n to phrase that express the willingness to help and support someone'
ax.set_xlabel(x_text, ha='center', labelpad=35, fontsize=14)
an1 = ax.annotate('+', xy=(0.02, -0.08), xycoords='axes fraction', xytext=(1, -0.08),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an2 = ax.annotate('-', xy=(-0.02, -0.08), xycoords=an1, fontsize=15)

# Vertical axis description with its '+' / '-' arrow.
y_text = 'PCA4: From phrase that include \"what \" or  \" oh my God\" \n to phrase about relationship and with name'
ax.set_ylabel(y_text, ha='center', labelpad=35, fontsize=14)
an3 = ax.annotate('+', xy=(-0.06, -0.02), xycoords='axes fraction', xytext=(-0.06, 1),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an4 = ax.annotate('-', xy=(-0.06, -0.04), xycoords=an3, fontsize=15)
fig.savefig('PCA3_4_FA.png', bbox_inches='tight')
plt.show()

### PCA5 vs PCA6

In [None]:
# Cluster the utterances in the PCA5/PCA6 plane and pick, for every cluster,
# the real utterance closest to its centroid (used as the legend text).
num_clusters = 10
X = df.loc[:, 'PCA5':'PCA6']

kmeans = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    max_iter=100,
    n_init=50,
    random_state=0,  # fixed seed: cluster ids, colours and legend are reproducible
)

label = kmeans.fit_predict(X)
df['Cluster'] = label
df2 = X.values
centroids = kmeans.cluster_centers_

# Row position (into df2) of the cluster member nearest to each centroid.
closest_pt_idx = []
for iclust in range(kmeans.n_clusters):
    cluster_pts_indices = np.where(kmeans.labels_ == iclust)[0]
    dist_to_centre = np.linalg.norm(df2[cluster_pts_indices] - centroids[iclust], axis=1)
    closest_pt_idx.append(cluster_pts_indices[np.argmin(dist_to_centre)])

# [cluster id, centroid x, centroid y], ordered by increasing centroid x.
centroid_list = sorted(
    ([i, centroids[i, 0], centroids[i, 1]] for i in range(len(centroids))),
    key=itemgetter(1),
)

u_labels = np.unique(label)
# closest_pt_idx holds row positions, so the lookup must be positional (.iloc);
# plain df['Said'][k] is label-based and only agrees when the index is 0..n-1.
text = [df['Said'].iloc[idx] for idx in closest_pt_idx]
text2 = ['\n'.join(wrap(l, 40)) for l in text]

# One colour per cluster; palette sized from num_clusters, not a literal 10.
colors = sns.color_palette('tab20', num_clusters)
ind_col_map = {x: y for x, y in zip(df['Cluster'].unique(), colors)}
ind_col_map = dict(sorted(ind_col_map.items()))

# Cluster ids in the order their centroids appear along the x axis.
key_order = [entry[0] for entry in centroid_list]

In [None]:
fig, ax = plt.subplots(figsize=(13, 9))

# Cluster point clouds first, centroids on top.
for cluster_id in u_labels:
    members = df2[label == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], color=ind_col_map[cluster_id], label=cluster_id, alpha=0.25)
ax.scatter(centroids[:, 0], centroids[:, 1], s=20, color='k')

# Legend entries follow key_order (not the raw cluster-id order) and show each
# cluster's representative phrase.
legend_list = [mpatches.Patch(color=ind_col_map[key], label=text2[key]) for key in key_order]

ax.set_xlim(-10, 13)
ax.set_ylim(-10, 14)

ax.legend(
    title='Average phrase',
    handles=legend_list,
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0,
    fontsize=12,
    title_fontsize=14,
)

# Horizontal axis description with its '+' / '-' arrow.
x_text = 'PCA5: From phrase about character relationships to phrase that include agreement'
ax.set_xlabel(x_text, ha='center', labelpad=35, fontsize=14)
an1 = ax.annotate('+', xy=(0.02, -0.08), xycoords='axes fraction', xytext=(1, -0.08),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an2 = ax.annotate('-', xy=(-0.02, -0.08), xycoords=an1, fontsize=15)

# Vertical axis description with its '+' / '-' arrow.
y_text = 'PCA6: From phrase that question a name \n to phrase about marriage and proposal'
ax.set_ylabel(y_text, ha='center', labelpad=35, fontsize=14)
an3 = ax.annotate('+', xy=(-0.06, -0.02), xycoords='axes fraction', xytext=(-0.06, 1),
                  arrowprops=dict(arrowstyle="<-", color='lightgray'))
an4 = ax.annotate('-', xy=(-0.06, -0.04), xycoords=an3, fontsize=15)
fig.savefig('PCA5_6_FA.png', bbox_inches='tight')
plt.show()

## Part2: Logistic regression

#### ROC AUC score and accuracy

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def logistic_function(dataset, number_pca, labels=None):
    """Fit a logistic regression on the first `number_pca` components and return its ROC AUC.

    Parameters
    ----------
    dataset : DataFrame with columns 'PCA1' ... 'PCA<number_pca>'.
    number_pca : int, index of the last PCA column to use as a feature.
    labels : optional array of integer class labels, one per row of `dataset`.
        Defaults to the module-level `integer_encoded`, which must then have
        been built from the same rows as `dataset`.

    Returns
    -------
    float : ROC AUC measured on the held-out third of the rows.
    """
    # Use the arguments, not the notebook globals `i` / `df`: the previous
    # version silently depended on the caller's loop variable being named `i`.
    last_component = 'PCA' + str(number_pca)
    X = scaled(dataset.loc[:, 'PCA1':last_component])

    if labels is None:
        labels = integer_encoded
    y = np.squeeze(labels)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    return roc_auc_score(y_test, model.decision_function(X_test), multi_class='ovr')


def logistic_function_accuracy(dataset, number_pca, labels=None):
    """Fit a logistic regression on the first `number_pca` components and return its accuracy.

    Parameters
    ----------
    dataset : DataFrame with columns 'PCA1' ... 'PCA<number_pca>'.
    number_pca : int, index of the last PCA column to use as a feature.
    labels : optional array of integer class labels, one per row of `dataset`.
        Defaults to the module-level `integer_encoded`, which must then have
        been built from the same rows as `dataset`.

    Returns
    -------
    float : fraction of correctly classified rows in the held-out third.
    """
    # Use the arguments, not the notebook globals `i` / `df`: the previous
    # version silently depended on the caller's loop variable being named `i`.
    last_component = 'PCA' + str(number_pca)
    X = scaled(dataset.loc[:, 'PCA1':last_component])

    if labels is None:
        labels = integer_encoded
    y = np.squeeze(labels)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    # accuracy_score expects (y_true, y_pred).
    return accuracy_score(y_test, model.predict(X_test))

def scaled(X):
    """Return `X` rescaled column by column as (value - column min) / column std.

    Parameters
    ----------
    X : DataFrame (or anything `pd.DataFrame` accepts) of numeric columns.

    Returns
    -------
    numpy.ndarray of floats with the same shape as `X`. The input is left
    untouched: a new array is returned instead of writing into `X.values`.

    The standard deviation is pandas' default (sample) standard deviation.
    Constant columns (std == 0) are only shifted, not divided, so they come
    out as zeros rather than NaN/inf.
    """
    frame = pd.DataFrame(X)
    # Per-column statistics. The previous version overwrote the column minimum
    # with a per-row minimum and then indexed it by column number.
    min_col = frame.min(axis=0).to_numpy(dtype=float)
    stand = frame.std(axis=0).to_numpy(dtype=float)
    stand = np.where(stand == 0, 1.0, stand)
    return (frame.to_numpy(dtype=float) - min_col) / stand

#### Dataset creation

In [None]:
# Restrict the data to two speakers and stack them (first speaker's rows first).
# NOTE(review): this rebinds `df` to the two-speaker subset, so every later
# cell that reads `df` only sees these rows - confirm that is intended for the
# cells that iterate over all six characters.
speaker = df['Person']
Phoebe = df[speaker.eq('Phoebe')]
Chandler = df[speaker.eq('Chandler')]

# Alternative pairing (Ross vs Phoebe), kept for reference:
#Ross = df.loc[df['Person']== 'Ross']
#df = pd.concat([Ross, Phoebe], ignore_index=True)

df = pd.concat([Chandler, Phoebe], ignore_index=True)

# Number of phrases per speaker in the resulting dataset.
print(df.pivot_table(index = ['Person'], aggfunc ='size'))

In [None]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Encode the speaker names, first as integers and then as one-hot vectors.
data = df['Person']
values = array(data)
print(values)

# integer encode: one integer per distinct speaker name
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print('integer_encoded: ',integer_encoded)

# binary encode
# `sparse` was renamed to `sparse_output` in newer scikit-learn releases and
# later removed; try the new keyword first and fall back for older versions.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# OneHotEncoder wants a 2-D column; integer_encoded keeps this (n, 1) shape
# afterwards, which is why the model functions squeeze it before use.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
# Sweep the number of leading PCA components (2..300) and collect one
# ROC AUC value per setting in `roc`.
# NOTE(review): keep the loop variable named `i` unless logistic_function has
# been checked - as reviewed, it builds its column name from a module-level
# `i` rather than from the argument passed here.
roc = []
for i in range(2,301,1):
    print('process until PCA',i)
    roc.append(logistic_function(df, i))
    print('done')
    

In [None]:
# Save the AUC curve under the pairing that was actually evaluated: the
# dataset-creation cell builds `df` from Chandler and Phoebe (the Ross variant
# is commented out), and the accuracy curve is saved as *_Chandler_Phoebe too.
# Writing to 'array_Ross_Phoebe.npy' here would overwrite the Ross results.
np.save('array_Chandler_Phoebe.npy', roc)

In [None]:
# Reload the stored AUC curves: CP = Chandler vs Phoebe, RP = Ross vs Phoebe.
array_CP = np.load(file='array_Chandler_Phoebe.npy')
array_RP = np.load(file='array_Ross_Phoebe.npy')

In [None]:
# AUC as a function of the number of leading PCA components, one curve per pairing.
x = np.arange(2, 301)
y1, y2 = np.asarray(array_CP), np.asarray(array_RP)

fig, ax = plt.subplots()

(line1,) = ax.plot(x, y1, '#9467bd', label='Phoebe vs Chandler')
(line2,) = ax.plot(x, y2, '#17becf', label='Phoebe vs Ross')
ax.set_ylim(0.51, 0.8)
ax.legend(handles=[line1, line2], loc = 'lower right')
ax.set_xlabel('Number of PCA')
ax.set_ylabel('AUC')
fig.savefig('Friends_auc.png')
plt.show()

In [None]:
# Sweep the number of leading PCA components (2..300) and collect one
# test-set accuracy per setting in `acc`.
# NOTE(review): keep the loop variable named `i` unless
# logistic_function_accuracy has been checked - as reviewed, it builds its
# column name from a module-level `i` rather than from the argument passed here.
acc = []
for i in range(2,301,1):
    print('process until PCA',i)
    acc.append(logistic_function_accuracy(df, i))
    print('done')
    

In [None]:
# Persist the accuracy curve computed by the sweep above.
np.save(file='accuracy_Chandler_Phoebe.npy', arr=acc)

In [None]:
# Reload the stored accuracy curves: CP = Chandler vs Phoebe, RP = Ross vs Phoebe.
accuracy_CP = np.load(file='accuracy_Chandler_Phoebe.npy')
accuracy_RP = np.load(file='accuracy_Ross_Phoebe.npy')

In [None]:
# Accuracy as a function of the number of leading PCA components, one curve per pairing.
x = np.arange(2, 301)
y1, y2 = np.asarray(accuracy_CP), np.asarray(accuracy_RP)

fig, ax = plt.subplots()
ax.set_ylim(0.51, 0.8)

(line1,) = ax.plot(x, y1, '#9467bd', label='Phoebe vs Chandler')
(line2,) = ax.plot(x, y2, '#17becf', label='Phoebe vs Ross')
ax.legend(handles=[line1, line2], loc = 'lower right')
ax.set_xlabel('Number of PCA')
ax.set_ylabel('Accuracy')
fig.savefig('Friends_accuracy.png')
plt.show()


## Part 3: Heat table and graph

In [None]:
import itertools

# Build every unordered pair of characters, printing each pair as it is added.
personnes = ['Rachel', 'Ross', 'Joey', 'Monica', 'Phoebe', 'Chandler']

couples = []
for couple in itertools.combinations(personnes, 2):
    couples.append(couple)
    print(couple)

In [None]:
def create_dataset(dataset, perso1, perso2):
    """Return the rows of `dataset` spoken by `perso1`, followed by those of `perso2`.

    Rows are selected by exact match on the 'Person' column. The result gets a
    fresh 0..n-1 index; `dataset` itself is not modified.
    """
    speaker = dataset['Person']
    per_character = [dataset.loc[speaker == name] for name in (perso1, perso2)]
    return pd.concat(per_character, ignore_index=True)

# Train one binary logistic regression per character pair on all 300 PCA
# components. Collected per pair: the 300 coefficients (coef_matrix), the
# intercept (model_intercept) and [name1, name2, ROC AUC] (roc).
# NOTE(review): `df` must contain the rows of all six characters here; an
# earlier cell rebinds `df` to a two-speaker subset - confirm which frame is
# in scope before trusting these results.
roc = []
coef_matrix = np.zeros((len(couples), 300))
model_intercept = np.zeros(len(couples))

for i, couple in enumerate(couples):
    first, second = str(couple[0]), str(couple[1])
    print(first, second)
    df_t = create_dataset(df, first, second)

    # Integer-encode the two speaker names as the classification target.
    y = np.squeeze(LabelEncoder().fit_transform(df_t['Person'].to_numpy()))

    # Standardise every component: (value - column mean) / column std.
    # Done as one vectorised expression; the previous element-by-element loop
    # looked up string-indexed Series with integer keys (a deprecated
    # positional fallback in recent pandas) and was needlessly slow.
    features = df_t.loc[:, 'PCA1':'PCA300']
    X = ((features - features.mean(axis=0)) / features.std(axis=0)).to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

    model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

    # Binary problem: coef_ has a single row and intercept_ a single value.
    coef_matrix[i] = model.coef_[0]
    model_intercept[i] = model.intercept_[0]

    # AUC on the held-out third
    roc_score = roc_auc_score(y_test, model.decision_function(X_test), multi_class='ovr')
    roc.append([first, second, roc_score])

In [None]:
from operator import itemgetter
# Show the pairs ordered by ascending ROC AUC (third field of each entry).
sorted(roc, key=lambda entry: entry[2])

In [None]:
# Human-readable label for each pair, in the same order as the rows of coef_matrix.
data = [' and '.join((str(first), str(second))) for first, second in couples]

# One row per pair: the 300 coefficients as columns PCA1..PCA300, then the pair label.
column_names = [f'PCA{i}' for i in range(1, 301)]
df_matrix = pd.DataFrame(coef_matrix, columns=column_names).assign(Pairs=data)
# Saving is disabled; the next cell nevertheless reads this file back.
#df_matrix.to_pickle('C:/Users/amaca253/Desktop/Friends-Friends-Language-Analysis/BBT/coef_matrix_Friends.pkl')

In [None]:
# Replace the in-memory coefficient table with the copy stored on disk.
# NOTE(review): unpickling can execute arbitrary code - only load files you
# trust. The absolute path is specific to one machine.
df_matrix = pd.read_pickle(
    filepath_or_buffer='C:/Users/amaca253/Desktop/Friends-Friends-Language-Analysis/BBT/coef_matrix_Friends.pkl'
)

In [None]:
 '''Here we would like to create a dataset such that we read each row
 will take the absolute value of the values and save the 10st values in a 
 matrix, and the values of the te corresponding pca in a list'''
df_subset = df_matrix.loc[:, 'PCA1':'PCA300']
data = np.zeros((len(df_subset),10))

text=[]
for row in np.arange(len(df_matrix)):
    selected_row = np.abs(df_subset.iloc[row])
    top_10_columns = selected_row.nlargest(10)
    data[row] =df_subset.loc[row][top_10_columns.index]
    text.append(top_10_columns.index)


column_names = [f'Column{i}' for i in range(1, 11)]
df2 = pd.DataFrame(data, columns=column_names)
df2['Pairs']= df_matrix['Pairs']


# Here we want to sort the value of the 1st column
df2 = df2.sort_values('Column1',ignore_index=True)

data2 = df2.loc[:, 'Column1':'Column10']

In [None]:

# Pair every top-10 coefficient with the name of the PCA column it came from,
# keeping the row order of `data` (i.e. the order before df2 was sorted).
data_list = [
    [[data[r, c], text[r][c]] for c in range(data.shape[1])]
    for r in range(len(data))
]

# Reorder the rows by their first [value, column name] cell.
sort_list = list(data_list)
sort_list.sort(key=itemgetter(0))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Heatmap of the ten strongest coefficients of every pair.
fig, ax = plt.subplots(figsize=(10, 6))
cmap = sns.diverging_palette(220, 20, as_cmap=True)

# Colour scale symmetric around zero, bounded by the largest |Column1| value.
maximum = data2['Column1'].abs().max()
heatmap = sns.heatmap(
    data2,
    ax=ax,
    cmap=cmap,
    vmin=-maximum,
    vmax=maximum,
    linewidths=.5,
    xticklabels=[],
    yticklabels=df2['Pairs'],
)

# Write the name of the PCA column in the centre of each cell.
for row_pos in range(len(df_subset)):
    row_cells = sort_list[row_pos]
    for col_pos in range(10):
        ax.text(col_pos + 0.5, row_pos + 0.5, row_cells[col_pos][1],
                ha='center', va='center', fontsize=8, color= 'white', fontweight = 'bold')
#plt.savefig('heatmap_friends_abs_top10.png',bbox_inches='tight')

plt.show()

In [None]:

def split_names(names):
    """Split a label such as "Ross and Rachel" into its whitespace-trimmed names."""
    return [part.strip() for part in names.split(" and ")]


# Relabel every pair "A and B" as "P(A|A or B)" for the heatmap's y axis.
# NOTE(review): the label names the first character of the pair; check that
# this matches the class the fitted model treats as positive.
pair = []
for pair_name in df2['Pairs']:
    if ' and ' not in pair_name:
        # Already relabelled (the cell was run before): keep the label as is
        # instead of failing on the missing second name.
        pair.append(pair_name)
        print(pair_name)
        continue
    couple = split_names(pair_name)
    temp = 'P('+ str(couple[0])+ '|' + str(couple[0]) + ' or ' + str(couple[1]) + ')'
    pair.append(temp)
    print(temp)


df2['Pairs'] = pair

In [None]:
# Same heatmap as before, now with the current df2['Pairs'] labels on the
# y axis, and saved to disk.
fig, ax = plt.subplots(figsize=(10, 6))
cmap = sns.diverging_palette(220, 20, as_cmap=True)

# Colour limits are +/- the largest absolute value found in Column1.
maximum = data2['Column1'].abs().max()
heatmap_kwargs = dict(
    yticklabels=df2['Pairs'],
    xticklabels=[],
    linewidths=.5,
    ax=ax,
    cmap=cmap,
    vmin=-maximum,
    vmax=maximum,
)
heatmap = sns.heatmap(data2, **heatmap_kwargs)

# Label each cell with the PCA column its coefficient belongs to.
text_style = dict(ha='center', va='center', fontsize=8, color= 'white', fontweight = 'bold')
for row_pos in range(len(df_subset)):
    for col_pos in range(10):
        cell_label = sort_list[row_pos][col_pos][1]
        ax.text(col_pos + 0.5, row_pos + 0.5, cell_label, **text_style)
fig.savefig('heatmap_Friends_abs_top10_2.png', bbox_inches='tight')

plt.show()

In [None]:
import networkx as nx

def split_names(names):
    """Return the names found in an "X and Y" label, each stripped of surrounding whitespace."""
    pieces = names.split(" and ")
    return list(map(str.strip, pieces))



# Directed graph of the characters: one weighted edge per pair whose
# coefficient on the chosen PCA component is large enough in absolute value.
PCA_COLUMN = 'PCA9'    # component to visualise (9 for Friends)
MIN_ABS_COEFF = 0.1    # pairs with |coefficient| <= this value get no edge

G = nx.DiGraph()

# Add nodes
node = personnes
G.add_nodes_from(node)

edges = []
for pair_name, coeff in zip(df_matrix['Pairs'], df_matrix[PCA_COLUMN]):
    print(coeff)
    if np.abs(coeff) <= MIN_ABS_COEFF:
        # Skip weak coefficients explicitly; incrementing the loop variable
        # inside a `for` loop (as done previously) has no effect.
        continue
    couple = split_names(pair_name)
    edges.append((couple[0], couple[1], coeff))


G.add_weighted_edges_from(edges)

# Create lists of edges and labels for positive and negative weights.
# Labels show the magnitude only; the sign is conveyed by the arrow direction.
positive_edges = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] >= 0]
negative_edges = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] < 0]
edge_labels = {(u, v): f"{abs(w):.2f}" for (u, v, w) in G.edges(data='weight')}

# Draw the graph
fig, ax = plt.subplots(figsize=(10, 8))  # Adjust the size of the plot
pos = nx.spring_layout(G, k=3)

# Positive weights: arrow head at the second character of the pair.
nx.draw(G, pos, with_labels=True, edgelist=positive_edges, node_size=2500, node_color="skyblue",
        font_size=9, arrows=True, arrowstyle= '-|>')

# Negative weights: arrow head reversed, at the first character of the pair.
nx.draw(G, pos, with_labels=True, edgelist=negative_edges,node_size=2500, node_color="skyblue",
        font_size=9, arrows=True, arrowstyle= '<|-')


nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
#plt.savefig("graphfriends.png")