In [1]:
import datetime
import time

import pandas as pd
import spacy
import re
import string

from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
%matplotlib inline

from spacy.tokens import Token
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.corpus import stopwords

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from varclushi import VarClusHi
# Let pandas show up to 500 rows of a displayed frame before truncating it.
pd.set_option('display.max_rows', 500)

In [2]:
# --- Load the preprocessed projects and the category hierarchy table ---
Path="src/"
Filename='projects_Preprocessed.csv'
Cat_File="category_hier.csv"

df=pd.read_csv(Path+Filename)
Cat_data=pd.read_csv(Path+Cat_File)

# Keep a row only when its PreProcessedDescription is present, is not the
# literal text 'abstract available' (after stripping whitespace), and is
# longer than 100 characters.
df = df.loc[
    lambda frame: frame.PreProcessedDescription.notna()
    & frame.PreProcessedDescription.str.strip().ne('abstract available')
    & frame.PreProcessedDescription.str.len().gt(100)
]

In [3]:
# Look up each project's Category by matching SubjectArea against
# File_Categories. Left join: every row of df is kept, unmatched rows get NaN.
# NOTE(review): rows of df are duplicated if File_Categories is not unique in
# Cat_data -- confirm uniqueness of that key.
merged_data = pd.merge(
    df,
    Cat_data.loc[:, ["File_Categories", "Category"]],
    how="left",
    left_on="SubjectArea",
    right_on="File_Categories",
)

In [4]:
# One-hot encode Category and append the indicator columns on the right.
dummies = pd.get_dummies(merged_data.Category)
merged_data = pd.concat((merged_data, dummies), axis="columns")

In [5]:
# TF-IDF conversion: turn each description into a row of (at most) 1000
# term weights and append those columns to merged_data.

vectorizer = TfidfVectorizer(max_features=1000) #ngram_range=(2, 2)
review_vectors = vectorizer.fit_transform(merged_data["PreProcessedDescription"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old name so the
# cell still runs on installations that predate 1.0.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# A freshly built frame already has a 0..n-1 index, so only merged_data needs
# its index reset for the column-wise concat to align row by row.
features_df = pd.DataFrame(review_vectors.toarray(), columns=feature_names)
merged_data = pd.concat(
    [merged_data.reset_index(drop=True), features_df],
    axis=1,
    ignore_index=False,
)

# Every column after the original df columns plus two more; the +2 presumably
# accounts for File_Categories and Category added by the merge (verify). This
# slice therefore holds the Category dummy columns followed by the TF-IDF terms.
wordslist=merged_data.columns.tolist()[len(df.columns)+2:]


In [6]:
#merged_data[merged_data.columns.tolist()[:len(df.columns)+2]].head()

In [8]:

nrec=60000
maxcluster=8

start = time.time()
print(f"{datetime.datetime.now()} : Started ")

# Cluster the columns that follow the original df columns (+2, presumably the
# two columns added by the category merge -- verify), using the first nrec rows.
feature_columns = merged_data.columns.tolist()[len(df.columns)+2:]
demo1_vc = VarClusHi(
    merged_data[feature_columns].head(nrec),
    maxeigval2=1,
    maxclus=maxcluster,
)
demo1_vc.varclus()

print(f"{datetime.datetime.now()} : Completed for {nrec} records in {round((time.time() - start)/60,2)} mins")


2020-01-24 11:59:25.037567 : Started 
2020-01-24 14:13:26.748971 : Completed for 60000 records in 134.03 mins


In [None]:
#demo1_vc.info

In [9]:
# Read the rsquare table off the fitted object, printing a timestamp before
# and after the access.
#[print(i, w['Variable'].unique) for i,w in demo1_vc.rsquare.groupby('Cluster')]
print(f"{datetime.datetime.now()} : Started ")
rsqresult = demo1_vc.rsquare
print(f"{datetime.datetime.now()} : Completed for {nrec} records")

2020-01-24 17:49:27.466055 : Started 
2020-01-24 17:49:54.541474 : Completed for 60000 records


In [None]:
#rsqresult.to_csv(Path+'variable_clusters_v2.csv', index=False)

In [None]:
# For each cluster, print its ten rows with the smallest RS_Ratio.
for _, cluster_rows in rsqresult.groupby('Cluster'):
    lowest_ratio_rows = cluster_rows.sort_values('RS_Ratio').head(10)
    print(lowest_ratio_rows)

In [None]:
# Rows of the rsquare table whose Variable is one of the Category dummy columns.
rsqresult.loc[rsqresult['Variable'].isin(dummies.columns)]

In [None]:
# Variables assigned to cluster 8, smallest RS_Ratio first.
rsqresult.loc[rsqresult.Cluster.eq(8)].sort_values('RS_Ratio')

In [None]:
# Lookup table giving each cluster id (stored as a string) a display name and
# a category. Written row-wise so each id sits next to its own labels.
_cluster_rows = [
    # (cluster_id, cluster_name, category)
    ('0',  'General',                                                   'General'),
    ('1',  'Life Sciences',                                             'Life Sciences'),
    ('2',  'Humanities and Social Sciences',                            'Humanities and Social Sciences'),
    ('3',  'Engineering Sciences',                                      'Engineering Sciences'),
    ('4',  'Natural Sciences',                                          'Natural Sciences'),
    ('5',  'Medicine',                                                  'Life Sciences'),
    ('6',  'Foundation & Acquisition',                                  'Uncategorized'),
    ('7',  'Chemistry & Mechanical & Electrical',                       'Uncategorized'),
    ('8',  'Life Sciences',                                             'Life Sciences'),
    ('9',  'Physics & Mathematical & Geometry & Analytical & Computer', 'Uncategorized'),
    ('10', 'Eco System & Chemistry',                                    'Natural Sciences'),
    ('11', 'History & Cultural',                                        'Humanities and Social Sciences'),
    ('12', 'Climate & Earth',                                           'Natural Sciences'),
    ('13', 'Human & Experiment',                                        'Uncategorized'),
    ('14', 'Biology & Genetics',                                        'Uncategorized'),
    ('15', 'Text & Publish',                                            'General'),
]
Cluster_info = pd.DataFrame(
    _cluster_rows,
    columns=['cluster_id', 'cluster_name', 'category'],
)

In [None]:
# Keep a handle on the `info` table of the fitted VarClusHi object; later
# cells read its 'Cluster' and 'N_Vars' columns.
cluster_comp_info=demo1_vc.info

In [None]:
# Preview: attach the name/category labels to each row of the info table.
# NOTE(review): cluster_id holds strings, so rows only match if `Cluster` is
# also stored as strings -- confirm its dtype.
pd.merge(
    cluster_comp_info,
    Cluster_info,
    how="left",
    left_on="Cluster",
    right_on="cluster_id",
)

In [None]:
# Info table with the name/category labels attached (left join, so every info
# row is kept even when no label matches).
# NOTE(review): cluster_id holds strings, so rows only match if `Cluster` is
# also stored as strings -- confirm its dtype.
clusters_info = pd.merge(
    cluster_comp_info,
    Cluster_info,
    how="left",
    left_on="Cluster",
    right_on="cluster_id",
)

In [None]:
# Cast N_Vars to int32 (later cells sum it), then display the table.
clusters_info["N_Vars"] = clusters_info["N_Vars"].astype("int32")
clusters_info

In [None]:
# Bar chart of the summed N_Vars for each (cluster_name, category) pair.
# N_Vars is selected *before* aggregating so that only that column is summed;
# summing the whole frame also tried to add up the string columns.
# The x=/y= keywords were dropped: the plotted object is a Series, whose bars
# come from its index and values, so they had no role (and were swapped).
n_vars_by_cluster = clusters_info.groupby(['cluster_name', 'category'])['N_Vars'].sum()
ax = n_vars_by_cluster.plot(kind='bar')
ax.set(
    xlabel='cluster_name, category',
    ylabel='N_Vars',
    title='N_Vars by cluster name and category',
);

In [None]:
# Bar chart of the summed N_Vars for each category.
# N_Vars is selected *before* aggregating so that only that column is summed;
# summing the whole frame also tried to add up the string columns.
# The x=/y= keywords were dropped: the plotted object is a Series, whose bars
# come from its index and values, so they had no role.
n_vars_by_category = clusters_info.groupby('category')['N_Vars'].sum()
ax = n_vars_by_category.plot(kind='bar')
ax.set(
    xlabel='category',
    ylabel='N_Vars',
    title='N_Vars by category',
);