In [127]:
%matplotlib notebook
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import functools


In [26]:
path = "data/health+news+in+twitter/Health-Tweets/bbchealth.txt"
df_bbc = pd.read_csv(path, sep="|").iloc[:, -1]
print(type(df_bbc))
df_bbc_len = df_bbc.to_numpy().shape[0]
df_bbc, df_bbc_len

<class 'pandas.core.series.Series'>


(0       GP workload harming care - BMA poll http://bbc...
 1       Short people's 'heart risk greater' http://bbc...
 2       New approach against HIV 'promising' http://bb...
 3       Coalition 'undermined NHS' - doctors http://bb...
 4       Review of case against NHS manager http://bbc....
                               ...                        
 3923    Baby born after ovaries 'reawakened' http://bb...
 3924    Identical triplets born against odds http://bb...
 3925    Hospital failed to make improvements http://bb...
 3926    New patient targets pledge for NHS http://bbc....
 3927    C. diff 'manslaughter' inquiry call http://bbc...
 Name: Breast cancer risk test devised http://bbc.in/1CimpJF, Length: 3928, dtype: object,
 3928)

In [27]:
path = "data/health+news+in+twitter/Health-Tweets/cbchealth.txt"
df_cbc = pd.read_csv(path, sep="|", on_bad_lines='skip').iloc[:, -1]
print(type(df_cbc))
df_cbc_len = df_cbc.to_numpy().shape[0]
df_cbc, df_cbc_len

<class 'pandas.core.series.Series'>


(0       Sabra hummus recalled in U.S. http://www.cbc.c...
 1       U.S. sperm bank sued by Canadian couple didn't...
 2       Manitoba pharmacists want clampdown on Tylenol...
 3       Mom of 7 'spooked' by vaccinations reverses st...
 4       Hamilton police send mental health pros to the...
                               ...                        
 3722    Rural doctors mentor medical students in U of ...
 3723    More men suffering from eating disorders, says...
 3724    5 sources of objective drug information sugges...
 3725    Top five ways to avoid allergies this spring h...
 3726    Health Canada to stop sales of small magnets h...
 Name: Drugs need careful monitoring for expiry dates, pharmacists say http://www.cbc.ca/news/health/drugs-need-careful-monitoring-for-expiry-dates-pharmacists-say-1.3026749?cmp=rss, Length: 3727, dtype: object,
 3727)

In [28]:
path = "data/health+news+in+twitter/Health-Tweets/cnnhealth.txt"
df_cnn = pd.read_csv(path, sep="|", on_bad_lines='skip').iloc[:, -1]
print(type(df_cnn))
df_cnn_len = df_cnn.to_numpy().shape[0]
df_cnn, df_cnn_len

<class 'pandas.core.series.Series'>


(0       A plant-based diet that incorporates fish may ...
 1       It doesn't take much to damage your hearing at...
 2       RT @CNN: Forever young? Discover this island’s...
 3       RT @CNN: Is post-traumatic stress disorder in ...
 4       Maysoon Zayid, a touring standup comic with Ce...
                               ...                        
 4039    RT @EverydayHealth: Want killer abs? @JillianM...
 4040    Medicare at stake -- @sanjayguptaCNN talks abo...
 4041    Ann Romney talks about her experience with MS ...
 4042    Make sure your first marathon isn't your last!...
 4043    Robin Roberts' cancer diagnosis http://at.cnn....
 Name: An abundance of online info can turn us into e-hypochondriacs. Or, worse, lead us to neglect getting the care we need http://cnn.it/1L1t1Fv, Length: 4044, dtype: object,
 4044)

In [29]:
df_total = pd.concat([df_bbc, df_cbc, df_cnn])
df_total

0       GP workload harming care - BMA poll http://bbc...
1       Short people's 'heart risk greater' http://bbc...
2       New approach against HIV 'promising' http://bb...
3       Coalition 'undermined NHS' - doctors http://bb...
4       Review of case against NHS manager http://bbc....
                              ...                        
4039    RT @EverydayHealth: Want killer abs? @JillianM...
4040    Medicare at stake -- @sanjayguptaCNN talks abo...
4041    Ann Romney talks about her experience with MS ...
4042    Make sure your first marathon isn't your last!...
4043    Robin Roberts' cancer diagnosis http://at.cnn....
Length: 11699, dtype: object

In [30]:
vectorizer = TfidfVectorizer(stop_words={'english'})
X = vectorizer.fit_transform(df_total).toarray()
y = np.array((["bbc"] * df_bbc_len + ["cbc"] * df_cbc_len + ["cnn"] * df_cnn_len))
X, y

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array(['bbc', 'bbc', 'bbc', ..., 'cnn', 'cnn', 'cnn'], dtype='<U3'))

In [31]:
pca = PCA(n_components=2)
res = pca.fit_transform(X)
res

array([[-0.09856497, -0.04505445],
       [-0.10547444, -0.05195151],
       [-0.10169382, -0.04402714],
       ...,
       [ 0.12759319, -0.06158463],
       [ 0.16749147, -0.09793305],
       [ 0.12840873, -0.08960084]])

In [32]:
bbc = res[0:df_bbc_len, :]

cbc = res[df_bbc_len: df_bbc_len + df_cbc_len, :]

cnn = res[df_bbc_len + df_cbc_len:, :]

plt.scatter(bbc[:, 0], bbc[:, 1])
plt.scatter(cbc[:, 0], cbc[:, 1])
plt.scatter(cnn[:, 0], cnn[:, 1])

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x143db7f7d90>

In [33]:
train, test, _, test_y_label = train_test_split(X, y, test_size=0.1, shuffle=True)
train.shape, test.shape

((10529, 23782), (1170, 23782))

In [34]:
model = KMeans(n_clusters=3)
model.fit(train)

array([0, 0, 1, ..., 0, 1, 2])

In [36]:
centers = model.cluster_centers_
centers

array([[ 1.17425256e-04,  1.68968269e-03,  8.80914265e-20, ...,
        -6.77626358e-20, -8.47032947e-20,  6.09863722e-20],
       [-1.15196481e-19,  2.01323579e-03,  1.08420217e-19, ...,
        -5.42101086e-20, -6.77626358e-20,  1.10157666e-04],
       [-1.35525272e-19,  1.40711849e-03,  9.39146862e-05, ...,
         9.57497321e-05,  5.61150676e-05,  6.09863722e-20]])

In [37]:
reduc_cen = pca.transform(centers)
reduc_cen

array([[-0.10995684, -0.05441467],
       [-0.00265865,  0.10866911],
       [ 0.10617362, -0.04220822]])

In [38]:
plt.scatter(bbc[:, 0], bbc[:, 1], c="blue")
plt.scatter(cbc[:, 0], cbc[:, 1], c="orange")
plt.scatter(cnn[:, 0], cnn[:, 1], c="green")
plt.scatter(reduc_cen[0, 0], reduc_cen[0, 1], c="black")
plt.scatter(reduc_cen[1, 0], reduc_cen[1, 1], c="black")
plt.scatter(reduc_cen[2, 0], reduc_cen[2, 1], c="black")

<matplotlib.collections.PathCollection at 0x143db89cf70>

In [39]:
bbc_arr, cbc_arr, cnn_arr = X[0:df_bbc_len, :], X[df_bbc_len: df_bbc_len + df_cbc_len, :], X[df_bbc_len + df_cbc_len:, :]
bbc_arr, cbc_arr, cnn_arr

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [40]:
bbc_mean, cbc_mean, cnn_mean = bbc_arr.mean(axis=0).reshape(1, -1), cbc_arr.mean(axis=0).reshape(1, -1), cnn_arr.mean(axis=0).reshape(1, -1)
print(bbc_mean.shape)
bbc_mean, cbc_mean, cnn_mean

(1, 23782)


(array([[0.00010651, 0.00187275, 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 array([[0.00000000e+00, 2.23100679e-03, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 9.47290906e-05]]),
 array([[0.00000000e+00, 1.31968309e-03, 8.73425161e-05, ...,
         8.90491450e-05, 5.21881230e-05, 0.00000000e+00]]))

In [41]:
bbc_mean_reduc = pca.transform(bbc_mean)
cbc_mean_reduc = pca.transform(cbc_mean)
cnn_mean_reduc = pca.transform(cnn_mean)
bbc_mean_reduc, cbc_mean_reduc, cnn_mean_reduc

(array([[-0.10999813, -0.0545872 ]]),
 array([[0.00050354, 0.10462424]]),
 array([[ 0.10637882, -0.04340159]]))

In [42]:
reduc_means = [bbc_mean_reduc, cbc_mean_reduc, cnn_mean_reduc]
plt.scatter(bbc[:, 0], bbc[:, 1], c="blue")
plt.scatter(cbc[:, 0], cbc[:, 1], c="orange")
plt.scatter(cnn[:, 0], cnn[:, 1], c="green")
plt.scatter(bbc_mean_reduc[0,0], bbc_mean_reduc[0,1], c="black")
plt.scatter(cbc_mean_reduc[0,0], cbc_mean_reduc[0,1], c="black")
plt.scatter(cnn_mean_reduc[0,0], cnn_mean_reduc[0,1], c="black")

<matplotlib.collections.PathCollection at 0x143db90d960>

In [43]:

def match_closest_cluster(actual_means, predicted_means, names, labels):
    n = len(actual_means)
    mapping = {

    }
    for i in range(n):
        mn = 1000000000
        index = -1
        for j in range(n):
            dist = np.linalg.norm(actual_means[i].flatten() - predicted_means[j])
            print(dist)
            if dist < mn:
                mn = dist
                index = j
        mapping[names[i]] = labels[index]

    return mapping

pred = model.predict(model.cluster_centers_)
pred

array([0, 1, 2])

In [44]:
mapping = match_closest_cluster(reduc_means, reduc_cen, ["bbc", "cbc", "cnn"], pred)
mapping

0.00017739806611306172
0.1953826617331633
0.21652589862339072
0.1936359252229938
0.005134238569461214
0.18090312186124516
0.2166158067792565
0.1871220694530724
0.0012108827614242194


{'bbc': 0, 'cbc': 1, 'cnn': 2}

In [45]:
plt.scatter(bbc[:, 0], bbc[:, 1], c="blue")
plt.scatter(cbc[:, 0], cbc[:, 1], c="orange")
plt.scatter(cnn[:, 0], cnn[:, 1], c="green")
plt.scatter(reduc_cen[mapping['bbc'], 0], reduc_cen[mapping['bbc'], 1], c="purple")
plt.scatter(reduc_cen[mapping['cbc'], 0], reduc_cen[mapping['cbc'], 1], c="green")
plt.scatter(reduc_cen[mapping['cnn'], 0], reduc_cen[mapping['cnn'], 1], c="orange")
plt.annotate("bbc", (reduc_cen[mapping['bbc'], 0], reduc_cen[mapping['bbc'], 1]))
plt.annotate("cbc", (reduc_cen[mapping['cbc'], 0], reduc_cen[mapping['cbc'], 1]))
plt.annotate("cnn", (reduc_cen[mapping['cnn'], 0], reduc_cen[mapping['cnn'], 1]))

Text(0.10617362387655076, -0.04220822265611092, 'cnn')

In [46]:
test_predicted = model.predict(test)
test_predicted

array([0, 0, 1, ..., 0, 1, 2])

In [47]:
reshape = test_y_label.reshape(-1, 1)
reshape.shape

(1170, 1)

In [48]:
applied = np.apply_along_axis(lambda x: mapping[x[0]], axis=1, arr=test_y_label.reshape(-1, 1))
applied

array([0, 0, 1, ..., 0, 1, 2])

In [49]:
np.sum(applied == test_predicted)

1137

In [50]:
test_predicted.shape

(1170,)

In [69]:
path = 'data/health+news+in+twitter/Health-Tweets'

dfs = []
lens = []
names = []
k_clus = 5
cnt = 0
for filename in os.listdir(path):
    f = os.path.join(path, filename)
    if os.path.isfile(f) and cnt < k_clus:
        dfs.append(pd.read_csv(f, sep="|", on_bad_lines='skip', encoding="latin1").iloc[:, -1])
        lens.append(dfs[-1].size)
        names.append(filename[:-4])
        cnt += 1

lens = np.array([0] + lens)
aggregated_df = pd.concat(dfs)
aggregated_df, names, lens

(0       GP workload harming care - BMA poll http://bbc...
 1       Short people's 'heart risk greater' http://bbc...
 2       New approach against HIV 'promising' http://bb...
 3       Coalition 'undermined NHS' - doctors http://bb...
 4       Review of case against NHS manager http://bbc....
                               ...                        
 1994    Researchers use video games to study how sleep...
 1995    Are energy drinks really that bad for you? htt...
 1996    Men suffering from #depression may also suffer...
 1997    #Thanksgiving science: Why #gratitude is good ...
 1998    Clinton Kellys fresh and #fruity take on #hol...
 Length: 16936, dtype: object,
 ['bbchealth', 'cbchealth', 'cnnhealth', 'everydayhealth', 'foxnewshealth'],
 array([   0, 3928, 3727, 4044, 3238, 1999]))

In [181]:
vectorizer = TfidfVectorizer(stop_words={'english'})
X = vectorizer.fit_transform(aggregated_df).toarray()
y = np.array(functools.reduce(lambda a, b: a + b, [[names[i]] * lens[i + 1] for i in range(len(names))]))
X, y

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array(['bbchealth', 'bbchealth', 'bbchealth', ..., 'foxnewshealth',
        'foxnewshealth', 'foxnewshealth'], dtype='<U14'))

In [182]:
train, test, _, test_y_label = train_test_split(X, y, test_size=0.1, shuffle=True)
train, test, _, test_y_label

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array(['everydayhealth', 'cnnhealth', 'cbchealth', ..., 'bbchealth',
        'foxnewshealth', 'everydayhealth'], dtype='<U14'),
 array(['cbchealth', 'everydayhealth', 'everydayhealth', ...,
        'foxnewshealth', 'cbchealth', 'cbchealth'], dtype='<U14'))

In [183]:
pca = PCA(n_components=3)
reduc_X = pca.fit_transform(X)
reduc_X

array([[ 0.12509172,  0.01893919, -0.03730462],
       [ 0.13665774,  0.01261094, -0.05033798],
       [ 0.13682302,  0.0189312 , -0.03562494],
       ...,
       [-0.00254024, -0.07760651, -0.03187585],
       [-0.07043945, -0.07826356,  0.00264425],
       [-0.02956929, -0.06636476, -0.00736317]])

In [184]:
clusters_means = []
clusters = []
prefix = lens.cumsum()
prefix

array([    0,  3928,  7655, 11699, 14937, 16936])

In [185]:
for i in range(lens.size - 1):
    clusters.append(reduc_X[prefix[i]:prefix[i+1],:])
    clusters_means.append(X[prefix[i]:prefix[i+1],:].mean(axis=0))

clusters_means, clusters, len(clusters)

([array([0.00010171, 0.00187191, 0.        , ..., 0.        , 0.        ,
         0.        ]),
  array([0.        , 0.00224272, 0.        , ..., 0.        , 0.        ,
         0.        ]),
  array([0.00000000e+00, 1.33511908e-03, 8.77821625e-05, ...,
         5.28469375e-05, 8.44114572e-05, 0.00000000e+00]),
  array([1.78937949e-04, 2.70675201e-04, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 8.45450557e-05]),
  array([0.        , 0.00182102, 0.        , ..., 0.        , 0.        ,
         0.        ])],
 [array([[ 0.12509172,  0.01893919, -0.03730462],
         [ 0.13665774,  0.01261094, -0.05033798],
         [ 0.13682302,  0.0189312 , -0.03562494],
         ...,
         [ 0.13197088,  0.01253093, -0.04121266],
         [ 0.17345575,  0.02420369, -0.05235318],
         [ 0.13224475,  0.02044482, -0.04430987]]),
  array([[ 0.07465552,  0.01928294,  0.18903037],
         [ 0.00328427,  0.00704507,  0.16447867],
         [ 0.00537464,  0.00302274,  0.16351695],


In [186]:
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
colors = ["red", "blue", "green", "purple", "orange"]
for i in range(len(clusters)):
    print(i)
    temp = clusters[i]
    ax.scatter(temp[:, 0], temp[:, 1], temp[:, 2], c=colors[i], label=names[i])

ax.legend(loc="upper left")

<IPython.core.display.Javascript object>

0
1
2
3
4


<matplotlib.legend.Legend at 0x144e8b0a950>

In [187]:
reduc_means = pca.transform(clusters_means)
reduc_means

array([[ 0.14352356,  0.02411771, -0.0443672 ],
       [-0.00017383, -0.03407663,  0.09386156],
       [-0.07816872,  0.09940448, -0.0176295 ],
       [-0.06337054, -0.06589701, -0.03236114],
       [-0.02091275, -0.07821307,  0.00026585]])

In [188]:
model = KMeans(n_clusters=k_clus)
model.fit(X)

In [224]:

def match_closest_cluster(actual_means, predicted_means, names, labels):
    n = len(actual_means)
    mapping = {

    }
    used = []
    for i in range(n):
        mn = 1000000000
        index = -1
        for j in range(n):
            dist = np.linalg.norm(actual_means[i].flatten() - predicted_means[j])
            if dist < mn and j not in used:
                mn = dist
                index = j
        mapping[names[i]] = labels[index]
        used.append(index)

    return mapping



In [225]:
reduc_centers = pca.transform(model.cluster_centers_)
pred = model.predict(model.cluster_centers_)
reduc_means,reduc_centers,names,pred.tolist()

(array([[ 0.14352356,  0.02411771, -0.0443672 ],
        [-0.00017383, -0.03407663,  0.09386156],
        [-0.07816872,  0.09940448, -0.0176295 ],
        [-0.06337054, -0.06589701, -0.03236114],
        [-0.02091275, -0.07821307,  0.00026585]]),
 array([[-0.05770307,  0.0091632 , -0.01873248],
        [-0.08901234,  0.13668954, -0.0153599 ],
        [-0.03573906, -0.09296796, -0.01636939],
        [ 0.143368  ,  0.02413678, -0.04420698],
        [ 0.02018274,  0.00566116,  0.18541162]]),
 ['bbchealth', 'cbchealth', 'cnnhealth', 'everydayhealth', 'foxnewshealth'],
 [0, 1, 2, 3, 4])

In [228]:
mapping = match_closest_cluster(reduc_means, reduc_centers, names, pred.tolist())
mapping

{'bbchealth': 3,
 'cbchealth': 4,
 'cnnhealth': 1,
 'everydayhealth': 2,
 'foxnewshealth': 0}

In [229]:
applied = np.apply_along_axis(lambda x: mapping[x[0]], axis=1, arr=test_y_label.reshape(-1, 1))
test_predicted = model.predict(test)
applied, test_predicted

(array([4, 2, 2, ..., 0, 4, 4]), array([2, 2, 2, ..., 2, 2, 4]))

In [230]:
np.sum(applied == test_predicted)

997

In [231]:
test.shape

(1694, 30112)

In [239]:
np.random.binomial(1, np.array([0.1]).repeat(repeats=100000))

array([0, 0, 0, ..., 0, 0, 0])

array([1])