In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import  calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the pickled, already-cleaned profiles DataFrame from disk.
# NOTE: unpickling executes code embedded in the file - only load pickles from a trusted source.
with open("./profiles.pkl",'rb') as profiles_file:
    df = pickle.load(profiles_file)

# Display the loaded frame as the cell output
df


Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Typical twitter fanatic. Infuriatingly humble thinker. Lifelong coffee practitioner. Organizer.,5,3,4,1,3,6,7
1,Web junkie. Analyst. Infuriatingly humble introvert. Food nerd. Lifelong music fanatic. Coffee lover.,7,9,5,1,9,4,0
2,Avid web maven. Food practitioner. Gamer. Twitter fanatic. Pop culture scholar. Zombie evangelist.,1,2,6,5,6,5,4
3,Twitteraholic. Extreme web fanatic. Food buff. Infuriatingly humble entrepreneur.,5,2,7,8,2,6,6
4,Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.,6,6,6,4,3,6,3
...,...,...,...,...,...,...,...,...
6595,Typical pop culture nerd. Infuriatingly humble internet maven. Alcohol evangelist.,7,9,0,0,2,2,4
6596,Avid web junkie. Lifelong alcohol guru. Hardcore reader. Award-winning twitter evangelist.,4,3,6,3,7,7,2
6597,Music ninja. Bacon fanatic. Reader. Total communicator. Unapologetic beer specialist.,1,4,0,4,9,2,5
6598,Communicator. Bacon lover. Award-winning introvert. Amateur internet ninja.,6,2,0,3,8,9,1


In [3]:
# Instantiating the Scaler
scaler = MinMaxScaler()

# Every column except the free-text 'Bios' is rescaled. The same column list is
# used both to select the data and to label the result, so the labels stay
# correct even if 'Bios' is not the first column of the frame.
category_cols = df.columns.drop('Bios')

# Scaling the categories; the original index is kept so the join below lines
# each scaled row up with its own bio.
scaled_categories = pd.DataFrame(
    scaler.fit_transform(df[category_cols]),
    columns=category_cols,
    index=df.index,
)

# Replacing the old values: 'Bios' first, followed by the scaled categories
df = df[['Bios']].join(scaled_categories)
df

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Typical twitter fanatic. Infuriatingly humble thinker. Lifelong coffee practitioner. Organizer.,0.555556,0.333333,0.444444,0.111111,0.333333,0.666667,0.777778
1,Web junkie. Analyst. Infuriatingly humble introvert. Food nerd. Lifelong music fanatic. Coffee lover.,0.777778,1.000000,0.555556,0.111111,1.000000,0.444444,0.000000
2,Avid web maven. Food practitioner. Gamer. Twitter fanatic. Pop culture scholar. Zombie evangelist.,0.111111,0.222222,0.666667,0.555556,0.666667,0.555556,0.444444
3,Twitteraholic. Extreme web fanatic. Food buff. Infuriatingly humble entrepreneur.,0.555556,0.222222,0.777778,0.888889,0.222222,0.666667,0.666667
4,Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.,0.666667,0.666667,0.666667,0.444444,0.333333,0.666667,0.333333
...,...,...,...,...,...,...,...,...
6595,Typical pop culture nerd. Infuriatingly humble internet maven. Alcohol evangelist.,0.777778,1.000000,0.000000,0.000000,0.222222,0.222222,0.444444
6596,Avid web junkie. Lifelong alcohol guru. Hardcore reader. Award-winning twitter evangelist.,0.444444,0.333333,0.666667,0.333333,0.777778,0.777778,0.222222
6597,Music ninja. Bacon fanatic. Reader. Total communicator. Unapologetic beer specialist.,0.111111,0.444444,0.000000,0.444444,1.000000,0.222222,0.555556
6598,Communicator. Bacon lover. Award-winning introvert. Amateur internet ninja.,0.666667,0.222222,0.000000,0.333333,0.888889,1.000000,0.111111


In [4]:
# Instantiating the Vectorizer (TF-IDF with the library's default settings)
vectorizer = TfidfVectorizer()

# Fitting the vectorizer to the Bios column and transforming it in one step;
# the result `x` is a sparse matrix with one row per bio and one column per term
x = vectorizer.fit_transform(df['Bios'])
x

<6600x110 sparse matrix of type '<class 'numpy.float64'>'
	with 85475 stored elements in Compressed Sparse Row format>

In [5]:
# Creating a new DF that contains the vectorized words (one column per term).
# The rows of `x` are in the same order as `df`, so reuse df's index: the later
# pd.concat(..., axis=1) aligns on index labels, and a default RangeIndex would
# misalign (or introduce NaN rows) whenever df's index is not exactly 0..n-1.
df_wrds = pd.DataFrame(
    x.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)
df_wrds

Unnamed: 0,advocate,aficionado,alcohol,alcoholaholic,amateur,analyst,animals,apathy,avid,award,...,unable,unapologetic,wannabe,web,webaholic,winning,with,writer,zombie,zombieaholic
0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0
1,0.0,0.000000,0.000000,0.0,0.000000,0.30791,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.238969,0.0,0.000000,0.0,0.0,0.00000,0.0
2,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.336385,0.000000,...,0.0,0.000000,0.0,0.254821,0.0,0.000000,0.0,0.0,0.25223,0.0
3,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.247144,0.0,0.000000,0.0,0.0,0.00000,0.0
4,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,0.0,0.000000,0.272898,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0
6596,0.0,0.000000,0.242540,0.0,0.000000,0.00000,0.0,0.0,0.317036,0.320020,...,0.0,0.000000,0.0,0.240163,0.0,0.320020,0.0,0.0,0.00000,0.0
6597,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.000000,...,0.0,0.370750,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0
6598,0.0,0.000000,0.000000,0.0,0.369445,0.00000,0.0,0.0,0.000000,0.366409,...,0.0,0.000000,0.0,0.000000,0.0,0.366409,0.0,0.0,0.00000,0.0


In [6]:
# Place the word-vector columns next to the scaled category columns, then
# discard the raw 'Bios' text, which the vectorized columns now stand in for
new_df = pd.concat([df, df_wrds], axis=1).drop(columns='Bios')

# Display the combined feature frame
new_df

Unnamed: 0,Movies,TV,Religion,Music,Sports,Books,Politics,advocate,aficionado,alcohol,...,unable,unapologetic,wannabe,web,webaholic,winning,with,writer,zombie,zombieaholic
0,0.555556,0.333333,0.444444,0.111111,0.333333,0.666667,0.777778,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0
1,0.777778,1.000000,0.555556,0.111111,1.000000,0.444444,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.238969,0.0,0.000000,0.0,0.0,0.00000,0.0
2,0.111111,0.222222,0.666667,0.555556,0.666667,0.555556,0.444444,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.254821,0.0,0.000000,0.0,0.0,0.25223,0.0
3,0.555556,0.222222,0.777778,0.888889,0.222222,0.666667,0.666667,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.247144,0.0,0.000000,0.0,0.0,0.00000,0.0
4,0.666667,0.666667,0.666667,0.444444,0.333333,0.666667,0.333333,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,0.777778,1.000000,0.000000,0.000000,0.222222,0.222222,0.444444,0.0,0.000000,0.272898,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0
6596,0.444444,0.333333,0.666667,0.333333,0.777778,0.777778,0.222222,0.0,0.000000,0.242540,...,0.0,0.000000,0.0,0.240163,0.0,0.320020,0.0,0.0,0.00000,0.0
6597,0.111111,0.444444,0.000000,0.444444,1.000000,0.222222,0.555556,0.0,0.000000,0.000000,...,0.0,0.370750,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.0
6598,0.666667,0.222222,0.000000,0.333333,0.888889,1.000000,0.111111,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.366409,0.0,0.0,0.00000,0.0


In [15]:
# Convert the feature frame into a plain list of row vectors (list of lists).
# Note: despite the name, no PCA has been applied to `df_pca` at this point.
df_pca = new_df.values.tolist()

# Length of a single row vector. The original referenced `df_final`, which is
# never defined in this notebook and fails with NameError on a fresh kernel.
len(df_pca[0])

117

In [None]:
from sklearn.decomposition import PCA

# Instantiating PCA with no component limit so every component is kept
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(new_df)

# Cumulative explained variance: entry k-1 is the share explained by the first k components
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_components_total = len(total_explained_variance)

# Plotting to determine how many features should the dataset be reduced to
plt.style.use("bmh")
plt.figure(figsize=(14,4))
plt.plot(range(1, n_components_total+1), total_explained_variance)
plt.show()

# Finding the exact number of features that explain at least 95% of the variance in the dataset.
# The cumulative sum never decreases, so the entries >= .95 form the tail of the array.
n_over_95 = int((total_explained_variance >= .95).sum())

# (n_components_total - n_over_95) is the zero-based INDEX of the first entry that
# reaches 95%; the number of components needed is that index + 1. Without the +1 the
# reduced PCA keeps one component too few and ends up below the 95% target.
n_to_reach_95 = n_components_total - n_over_95 + 1

# Index n_to_reach_95 - 1 holds the variance explained by exactly n_to_reach_95 components
print(f"Number features: {n_to_reach_95}\nTotal Variance Explained: {total_explained_variance[n_to_reach_95 - 1]}")

In [None]:
# Re-create PCA limited to `n_to_reach_95` components (the value computed in the previous cell)
pca = PCA(n_components=n_to_reach_95)

# Fitting and transforming the dataset to the stated number of features;
# this overwrites the earlier `df_pca` with the reduced representation
df_pca = pca.fit_transform(new_df)

# Seeing the variance ratio that still remains after the dataset has been reduced
# (last entry of the cumulative sum = total share explained by the kept components)
pca.explained_variance_ratio_.cumsum()[-1]

In [30]:
import os

import pinecone

# SECURITY: credentials must not live in the notebook. The API key is read from
# the PINECONE_API_KEY environment variable (a missing variable raises KeyError
# here rather than sending an empty key). The key that was previously hardcoded
# in this cell has been committed in plain text and should be revoked/rotated.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    # The environment can be overridden; it defaults to the value used originally
    environment=os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter"),
)

# Handle to the existing index that the vectors are upserted into
index = pinecone.Index("blend-research")

In [31]:
import itertools

def chunks(iterable, batch_size=100):
    """Yield consecutive tuples of at most ``batch_size`` items taken from ``iterable``.

    The last tuple may be shorter than ``batch_size``; an empty iterable yields nothing.
    """
    source = iter(iterable)
    # Two-argument iter(): keep calling the slicer until it returns the empty
    # tuple, which signals that the underlying iterator is exhausted.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())


In [32]:
# Number of vectors to upload and the length of each one
vector_count = len(df_pca)
vector_dim = len(df_pca[0])
print(vector_dim, vector_count)

# Lazily build (id, vector) pairs; ids are 'persona-<row position>'
example_data_generator = (
    (f'persona-{row}', list(df_pca[row])) for row in range(vector_count)
)

# Send the pairs to the index 100 at a time, reporting after each request
for _batch in chunks(example_data_generator, batch_size=100):
    index.upsert(vectors=_batch)
    print("chunk upserted")
    
  

117 6600
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted
chunk upserted


In [33]:
# Query the index for its statistics to confirm the upload
# (the recorded output reports the dimension and the total vector count)
index.describe_index_stats()

{'dimension': 117,
 'index_fullness': 0.066,
 'namespaces': {'': {'vector_count': 6600}},
 'total_vector_count': 6600}

In [35]:
# Show only the profile labelled 1468. The output recorded for this cell is that
# single row, which a bare `df` cannot reproduce on a fresh run (it would display
# the whole frame), so the row is selected explicitly. The double brackets keep
# the result a one-row DataFrame.
df.loc[[1468]]

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
1468,Bacon lover. Infuriatingly humble explorer. Total internet evangelist. Travel trailblazer.,1.0,0.444444,0.777778,1.0,1.0,0.444444,0.777778


In [39]:
# Look up the profile at integer position 5098 (positional, not label-based)
# NOTE(review): presumably inspecting a match returned from the vector index - confirm
df.iloc[5098]

Bios        Pop culture enthusiast. Tv practitioner. Unable to type with boxing gloves on. Social media buff. Hipster-friendly travel fanatic. Reader.
Movies                                                                                                                                        0.555556
TV                                                                                                                                                 1.0
Religion                                                                                                                                      0.777778
Music                                                                                                                                         0.222222
Sports                                                                                                                                        0.666667
Books                                                                                         

In [40]:
# Look up the profile at integer position 100 (positional, not label-based)
# NOTE(review): presumably inspected for comparison with another profile - confirm
df.iloc[100]

Bios        Explorer. Typical travel nerd. Unable to type with boxing gloves on. Bacon ninja. Music fan. Professional reader.
Movies                                                                                                               0.888889
TV                                                                                                                        1.0
Religion                                                                                                             0.555556
Music                                                                                                                0.111111
Sports                                                                                                               0.888889
Books                                                                                                                0.444444
Politics                                                                                                              