In [10]:
import numpy as np
import pandas as pd
from scipy import linalg

In [3]:
s144 = pd.read_csv('s144.csv')

In [5]:
# Read it in
path = "../data/surveys.xlsx"
survey_32W = pd.read_excel('Survey_32N and 32W consolidated.xlsx', sheet_name="Survey_32W", converters={'T3':str})
survey_32N = pd.read_excel('Survey_32N and 32W consolidated.xlsx', sheet_name="Survey_32N")

# Drop unnecessary columns
survey_32W.drop(columns=['Unnamed: 0', 'subject_id', 'image_name', 'image_name_2'], inplace=True)
survey_32N.drop(columns=['Unnamed: 0', 'subject_id', 'image_name', 'image_name_2'], inplace=True)

# Drop rows missing response
survey_32W = survey_32W[
    (survey_32W['T3'] != ' ') &
    (survey_32W['T3'] != 'none') &
    (survey_32W['T3'].notnull()) &
    (survey_32W['T3'] != 'No Comments ') &
    (survey_32W['T3'] != 0) &
    (survey_32W['T3'] != 'None') &
    (survey_32W['T3'] != '[NO ANSWER]')]

# Select responses
res_together = survey_32W['T3'][survey_32W['T1'] == "['They should be together in the same outfits']"].tolist()
res_separate = survey_32W['T3'][survey_32W['T1'] == "['They should be in separate outfits']"].tolist()

res_N = survey_32N['T5'].tolist()

In [6]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

In [7]:
embeddings = model.encode(res_together)

In [8]:
shaped = np.transpose(embeddings)

In [11]:
U, S, Vt = linalg.svd(shaped)

In [16]:
no_float = [i for i in res_N if type(i) != float]
no_unclear = [i for i in no_float if "unclear" not in i]

In [17]:
embed = model.encode(no_unclear)

In [32]:
clustering_step0 = pd.DataFrame({'Response': no_unclear, 'Embedding': embed})
clustering_step1 = pd.DataFrame({'Response #': range(0, len(embed))})

In [33]:
for i in range(0, 768):
    x = []
    for j in range(0, len(embed)):
        x.append(clustering_step0['Embedding'][j][i])
    clustering_step1['x'+str(i)] = x

In [34]:
clustering_step1

Unnamed: 0,Response #,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x758,x759,x760,x761,x762,x763,x764,x765,x766,x767
0,0,-0.537486,-0.564022,-0.239167,0.442276,-0.368987,0.351748,0.150678,-0.453972,0.004077,...,-0.733307,0.963909,-0.267696,-1.474482,-0.139646,-0.000632,-0.518430,0.029148,-0.349704,0.641719
1,1,-0.214992,0.580386,0.443751,0.505816,-0.749662,-0.185047,0.921741,-0.230475,0.477833,...,-0.089813,0.691420,0.742550,-1.448568,-0.545186,0.429773,-0.311271,0.444563,-0.531232,0.895420
2,2,-0.221185,-0.399868,-0.199939,0.446165,-0.415950,0.128539,0.631261,-0.336796,-0.060227,...,-1.046082,0.840199,-0.032127,-2.288132,-0.212985,0.444507,-0.337037,-0.118294,-0.242437,0.492242
3,3,-0.129601,-0.261325,0.835405,0.506073,0.061219,-0.294036,0.120656,-0.810101,0.435580,...,-0.442945,0.436407,-0.291818,-0.086346,0.090718,-0.196374,-0.844386,-0.142228,-0.793373,0.124300
4,4,0.026505,0.283260,0.978467,-0.014221,-0.527762,0.093359,0.749792,-0.624472,0.186433,...,-0.548998,0.189389,-0.127855,-2.026973,-0.330698,0.355806,-0.406290,0.462126,-1.371467,0.409943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2424,2424,-0.711136,0.113123,0.207990,0.602048,-0.546251,-0.401846,0.219555,-0.602678,0.259090,...,-0.832128,0.665866,0.062957,-1.939837,-0.322143,0.209041,0.075129,0.175395,-0.726463,0.625140
2425,2425,-0.290773,0.437435,-0.921718,0.344446,-0.601947,-0.189217,1.077664,-0.748244,-0.147181,...,-0.140875,0.655436,-0.767125,-1.078763,-0.643417,-0.131941,-0.127235,0.085191,-0.324287,-0.020368
2426,2426,-0.195325,0.333984,0.474740,-0.429105,-0.263347,0.084559,1.964459,-0.551853,-0.010522,...,-0.227955,0.239392,-0.269595,-1.785775,-0.143772,-0.077310,0.109145,0.680955,-0.894933,0.600239
2427,2427,0.109339,-0.112249,1.369917,0.347484,-0.637460,0.467727,-0.512742,0.029860,0.402951,...,-0.077584,0.862380,0.930113,-0.568004,-0.299933,0.637574,-0.340838,0.154286,-0.664447,1.367225


# Doing initial clustering

In [35]:
from sklearn.cluster import KMeans

In [38]:
kmeans = KMeans(n_clusters=20).fit(clustering_step1.iloc[:, 1:])

In [41]:
clustering_step0['Cluster'] = [i for i in kmeans.labels_]

In [43]:
clustering_step0

Unnamed: 0,Response,Embedding,Cluster
0,Negroes have been told many times they're figh...,"[-0.5374858, -0.5640219, -0.23916735, 0.442276...",6
1,I dont like the army. I had rather be on the o...,"[-0.21499223, 0.580386, 0.44375148, 0.50581574...",7
2,"I think that if were going to win this war, th...","[-0.22118495, -0.3998682, -0.19993904, 0.44616...",6
3,Above all lets stick together and beat the Axi...,"[-0.12960061, -0.26132488, 0.835405, 0.5060726...",6
4,The questions I have seen are good & now whats...,"[0.026505115, 0.28326, 0.9784666, -0.014221419...",15
...,...,...,...
2424,This is a very democratic gesture of the gover...,"[-0.71113575, 0.11312319, 0.20799007, 0.602047...",3
2425,Why is it the negro cant get rating like the w...,"[-0.2907732, 0.43743494, -0.92171776, 0.344446...",10
2426,I only say that all people should stick togeth...,"[-0.19532503, 0.33398426, 0.47473958, -0.42910...",7
2427,This his your war and I am just fighting in it.,"[0.10933893, -0.112249, 1.3699169, 0.3474837, ...",16


In [59]:
clustering_step0['bag'] = clustering_step0.Response.map(lambda t:t.replace(';','').lower().split())
clustering_step0

Unnamed: 0,Response,Embedding,Cluster,bag
0,Negroes have been told many times they're figh...,"[-0.5374858, -0.5640219, -0.23916735, 0.442276...",6,"[negroes, have, been, told, many, times, they'..."
1,I dont like the army. I had rather be on the o...,"[-0.21499223, 0.580386, 0.44375148, 0.50581574...",7,"[i, dont, like, the, army., i, had, rather, be..."
2,"I think that if were going to win this war, th...","[-0.22118495, -0.3998682, -0.19993904, 0.44616...",6,"[i, think, that, if, were, going, to, win, thi..."
3,Above all lets stick together and beat the Axi...,"[-0.12960061, -0.26132488, 0.835405, 0.5060726...",6,"[above, all, lets, stick, together, and, beat,..."
4,The questions I have seen are good & now whats...,"[0.026505115, 0.28326, 0.9784666, -0.014221419...",15,"[the, questions, i, have, seen, are, good, &, ..."
...,...,...,...,...
2424,This is a very democratic gesture of the gover...,"[-0.71113575, 0.11312319, 0.20799007, 0.602047...",3,"[this, is, a, very, democratic, gesture, of, t..."
2425,Why is it the negro cant get rating like the w...,"[-0.2907732, 0.43743494, -0.92171776, 0.344446...",10,"[why, is, it, the, negro, cant, get, rating, l..."
2426,I only say that all people should stick togeth...,"[-0.19532503, 0.33398426, 0.47473958, -0.42910...",7,"[i, only, say, that, all, people, should, stic..."
2427,This his your war and I am just fighting in it.,"[0.10933893, -0.112249, 1.3699169, 0.3474837, ...",16,"[this, his, your, war, and, i, am, just, fight..."


In [60]:
clustering_step0['len'] = clustering_step0.bag.map(len)
clustering_step0.head()

Unnamed: 0,Response,Embedding,Cluster,bag,len
0,Negroes have been told many times they're figh...,"[-0.5374858, -0.5640219, -0.23916735, 0.442276...",6,"[negroes, have, been, told, many, times, they'...",140
1,I dont like the army. I had rather be on the o...,"[-0.21499223, 0.580386, 0.44375148, 0.50581574...",7,"[i, dont, like, the, army., i, had, rather, be...",36
2,"I think that if were going to win this war, th...","[-0.22118495, -0.3998682, -0.19993904, 0.44616...",6,"[i, think, that, if, were, going, to, win, thi...",94
3,Above all lets stick together and beat the Axi...,"[-0.12960061, -0.26132488, 0.835405, 0.5060726...",6,"[above, all, lets, stick, together, and, beat,...",41
4,The questions I have seen are good & now whats...,"[0.026505115, 0.28326, 0.9784666, -0.014221419...",15,"[the, questions, i, have, seen, are, good, &, ...",32


In [44]:
import seaborn as sns; sns.set()
import re
import itertools

# First Crack at TFIDF

In [53]:
g = clustering_step0.groupby('Cluster')
g.size().head()

Cluster
0    141
1    115
2     83
3     96
4     64
dtype: int64

In [62]:
pd.Series(clustering_step0.bag[0]).value_counts()

the             6
a               5
to              5
negroes         5
and             4
               ..
from            1
war             1
since           1
countries.      1
emancipation    1
Length: 91, dtype: int64

In [65]:
TF = clustering_step0.bag.apply(lambda bag : pd.Series(bag).value_counts())
TF

Unnamed: 0,the,a,to,negroes,and,that,have,of,for,was,...,italian,hardhships,follow.,it!,experiencing,pockets.,[insertion]in,drill.,sided,afarie
0,6.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,3.0,...,,,,,,,,,,
1,5.0,,1.0,,1.0,,2.0,1.0,,,...,,,,,,,,,,
2,11.0,2.0,3.0,,2.0,6.0,1.0,,,,...,,,,,,,,,,
3,1.0,,2.0,,1.0,,,1.0,,,...,,,,,,,,,,
4,2.0,,1.0,,,,2.0,1.0,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2424,6.0,3.0,3.0,,3.0,,1.0,6.0,2.0,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,
2425,4.0,2.0,,,,,,2.0,,,...,,,,,,,,1.0,1.0,1.0
2426,2.0,1.0,2.0,,1.0,1.0,1.0,,,,...,,,,,,,,,,
2427,,,,,1.0,,,,,,...,,,,,,,,,,


In [66]:
g = clustering_step0.groupby('Cluster')
g.size()

Cluster
0     141
1     115
2      83
3      96
4      64
5     242
6     266
7     183
8      87
9      93
10    123
11     38
12     71
13    107
14    124
15     46
16    115
17    230
18    180
19     25
dtype: int64

In [54]:
clusters = g.aggregate(lambda listofbags: 
                list(itertools.chain.from_iterable(listofbags)))

In [67]:
clusters = g.bag.aggregate(lambda listofbags: list(itertools.chain.from_iterable(listofbags)))
clusters

Cluster
0     [i, consider, this, questionnaire, the, best, ...
1     [the, infantry, is, all, right, but, it, is, t...
2     [i, think, this, is, a, very, goot, idia, gett...
3     [i, think, this, questionnaire, was, a, good, ...
4     [i, have, a, good, commanding, officer, +, all...
5     [why, do, they, let, all, white, drive, trucks...
6     [negroes, have, been, told, many, times, they'...
7     [i, dont, like, the, army., i, had, rather, be...
8     [question, no, 3., i, tried, to, volenter, but...
9     [the, questionnaire, was, all, right, and, i, ...
10    [a, colored, soldier, will, never, get, the, e...
11    [thank, you, i, think, it, is, a, fairly, good...
12    [i, don't, have, any, trouble, with, any, body...
13    [we, should, have, more, negro, office., we, s...
14    [well, i, only, have, a, short, one, that, is,...
15    [the, questions, i, have, seen, are, good, &, ...
16    [think, we, going, to, win, this, war, since, ...
17    [i, have, been, in, pain, ever, si

In [70]:
TF = clusters.apply(lambda bag : pd.Series(bag).value_counts())
TF

Unnamed: 0_level_0,the,i,a,to,in,be,and,negro,as,of,...,airport,tolling,prectly,give.,prope,thaying,comision,unprediduced,exchant,commince
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,477.0,214.0,201.0,199.0,173.0,171.0,169.0,121.0,109.0,108.0,...,,,,,,,,,,
1,192.0,159.0,77.0,88.0,66.0,31.0,68.0,1.0,13.0,55.0,...,,,,,,,,,,
2,64.0,67.0,38.0,66.0,28.0,29.0,29.0,1.0,9.0,24.0,...,,,,,,,,,,
3,280.0,138.0,135.0,155.0,89.0,41.0,104.0,19.0,38.0,99.0,...,,,,,,,,,,
4,112.0,99.0,58.0,53.0,55.0,16.0,38.0,1.0,10.0,34.0,...,,,,,,,,,,
5,1548.0,472.0,576.0,715.0,517.0,287.0,570.0,289.0,260.0,500.0,...,,,,,,,,,,
6,1660.0,592.0,592.0,840.0,505.0,424.0,712.0,316.0,358.0,421.0,...,,,,,,,,,,
7,765.0,297.0,344.0,414.0,281.0,221.0,373.0,53.0,160.0,271.0,...,,,,,,,,,,
8,168.0,182.0,80.0,110.0,83.0,46.0,63.0,4.0,14.0,51.0,...,,,,,,,,,,
9,60.0,103.0,73.0,58.0,15.0,12.0,38.0,1.0,19.0,24.0,...,,,,,,,,,,


In [72]:
IDF = np.log(len(TF)/TF.count())
IDF.sort_values()

the              0.000000
don't            0.000000
thing            0.000000
what             0.000000
so               0.000000
                   ...   
corporaction.    2.995732
socolize         2.995732
(red)            2.995732
system,          2.995732
commince         2.995732
Length: 10411, dtype: float64

In [73]:
TFIDF = TF * IDF
TFIDF

Unnamed: 0_level_0,the,i,a,to,in,be,and,negro,as,of,...,airport,tolling,prectly,give.,prope,thaying,comision,unprediduced,exchant,commince
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,10.309952,0.0,0.0,0.0,8.668567,27.00037,11.484296,5.539676,...,,,,,,,,,,
1,0.0,0.0,3.949584,0.0,0.0,0.0,3.487944,0.223144,1.369687,2.821131,...,,,,,,,,,,
2,0.0,0.0,1.949145,0.0,0.0,0.0,1.487506,0.223144,0.948245,1.231039,...,,,,,,,,,,
3,0.0,0.0,6.924595,0.0,0.0,0.0,5.334503,4.239727,4.0037,5.078036,...,,,,,,,,,,
4,0.0,0.0,2.975011,0.0,0.0,0.0,1.949145,0.223144,1.053605,1.743972,...,,,,,,,,,,
5,0.0,0.0,29.544938,0.0,0.0,0.0,29.237178,64.488486,27.393734,25.646647,...,,,,,,,,,,
6,0.0,0.0,30.36563,0.0,0.0,0.0,36.520826,70.513362,37.719065,21.594477,...,,,,,,,,,,
7,0.0,0.0,17.644893,0.0,0.0,0.0,19.132399,11.826608,16.857683,13.900483,...,,,,,,,,,,
8,0.0,0.0,4.103464,0.0,0.0,0.0,3.231478,0.892574,1.475047,2.615958,...,,,,,,,,,,
9,0.0,0.0,3.74441,0.0,0.0,0.0,1.949145,0.223144,2.00185,1.231039,...,,,,,,,,,,


In [135]:
final = TFIDF.fillna(0)

In [177]:
np.argsort(np.array(final.loc[2]))[-4:]

array([1805, 1797, 1801, 1794], dtype=int64)

In [183]:
np.argsort(np.array(final.loc[0]))[-4:]

array([12, 34, 19,  7], dtype=int64)

In [184]:
(final.loc[:0].values.tolist())[0]

[0.0,
 0.0,
 10.309952171897647,
 0.0,
 0.0,
 0.0,
 8.668566751496032,
 27.00036970901938,
 11.484296206703073,
 5.5396757938554515,
 5.2832093219177,
 4.872862966817296,
 14.78922258429752,
 0.0,
 4.5651032004919925,
 4.462516611716892,
 0.0,
 4.411223317329341,
 8.850283315257414,
 22.151519578787124,
 0.0,
 0.0,
 7.691317643021324,
 3.7444104902911852,
 7.480596611705671,
 3.641823901516084,
 7.164515064732192,
 0.0,
 5.268025782891318,
 7.313351827399872,
 9.818316257825229,
 2.256904953052221,
 6.825795038906548,
 6.825795038906548,
 17.66209955979063,
 3.898339079339575,
 1.897851892339368,
 8.033167847311551,
 3.6876180480239222,
 3.582257532366096,
 1.692678714789166,
 1.6413854204016154,
 1.6413854204016154,
 8.918144246005205,
 4.875567884933248,
 4.550530025937698,
 2.8447339227613115,
 4.388011096439923,
 1.3336256540763125,
 0.0,
 4.062973237444374,
 4.062973237444374,
 1.2823323596887621,
 6.904369738842741,
 3.7379353784488236,
 3.7379353784488236,
 2.31793134447218,
 7.

In [189]:
np.argmax(np.array(final.loc[0].values.tolist())[0])

0

# Making dataframe of 4 word dictionaries

In [195]:
listoflists = []
for i in range(0, 20):
    listoflists.append([final.loc[i].index[j] for j in np.argsort(np.array(final.loc[i]))[-4:]])

In [164]:
[(final.loc[:8].values.tolist())[0][j] for j in list(np.argsort(np.array(final.loc[:8].values.tolist())[0])[-4:])]

[14.78922258429752, 17.66209955979063, 22.151519578787124, 27.00036970901938]

In [196]:
dictionary = pd.DataFrame({'Cluster': range(0, 20), 'Individual Dictionary': listoflists})

In [197]:
dictionary

Unnamed: 0,Cluster,Individual Dictionary
0,0,"[better, negroes, white, negro]"
1,1,"[strenuous, j.a.f.s, poor, army]"
2,2,"[gi, mortuary, science, nergro]"
3,3,"[a, department, express, army]"
4,4,"[cin, ball,, nice., army]"
5,5,"[fight, negro, southern, white]"
6,6,"[war, fight, white, negro]"
7,7,"[a, army, and, white]"
8,8,"[drafted, haven, army?, army]"
9,9,"[useful, questionnaires, questions, informative]"
