In [46]:
from IPython.core.interactiveshell import InteractiveShell
from datetime import datetime

#basic externals
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import feather
from random import randint

#visual libs
import matplotlib.pyplot  as plt
import seaborn as sns

# sklearn model imports
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score, roc_curve, classification_report, confusion_matrix, f1_score, roc_auc_score, jaccard_similarity_score,pairwise_distances
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain
from sklearn.decomposition import LatentDirichletAllocation 

#Text models
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

#local imports
from xm_functions import *


InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 20
pd.options.display.max_columns = 50 
pd.set_option('display.float_format', lambda x: '%.3f' % x) #display numbers as decimals instead of scientific value

plt.rcParams["font.size"] = (18)

### Quickly import our feather clean dataset

In [47]:
df = pd.read_feather('df_transformed_stackoverflow_posts.feather')

In [48]:
df.head(3)
df.shape

Unnamed: 0,title,body,tags,old_tags,body_LancasterStemmer,title_LancasterStemmer,old_body_LancasterStemmer,old_title_LancasterStemmer
0,certain situations require use bitwise operato...,day trying code small c++ programming using sd...,c++,c++ event-handling sdl bitwise-operators boole...,day try cod smal c++ program us sdl libr ran s...,certain situ requir us bitw op instead log eq op,day try cod smal c++ program us sdl multimed l...,certain situ requir us bitw op instead log eq op
1,create channel youtube upload videos possible ...,using https github com youtube yt direct lite ...,ios objective-c,ios objective-c video youtube-data-api youtube...,us https github com youtub direct lit io sampl...,cre channel youtub upload video poss cre chann...,us https github com youtub yt direct lit io sa...,cre channel youtub upload video poss cre chann...
2,design decision boolean containsall collection...,boolean containsall collection c method collec...,java generics,java generics wildcard,bool collect c method collect framework allow ...,design decid bool collect c vs bool addal coll...,bool containsal collect c method collect frame...,design decid bool containsal collect c vs bool...


(43254, 8)

## Apply Pre transform to our posts, titles and tags

In [49]:
posts_tags= df.loc[0:, 'tags']
posts_ids = df.index
nb_posts= posts_ids.shape[0]

### CountVectorizer of tags

In [85]:
min_occurences = round(nb_posts/1000)
# binary ensures repeated tags are counted once
countvector = CountVectorizer(min_df=43, binary =True, token_pattern ='[^\s]+')

# Apply fit_transform 
csrm_y = countvector.fit_transform(posts_tags)

tags_features = pd.Series([word for word in countvector.get_feature_names()])
df_y= pd.DataFrame(csrm_y.toarray(),index=posts_ids, columns=tags_features.values)

nb_tags_distinct=df_y.shape[1]
print('We limit the minmum number of occurences for a tag to',min_occurences)
print(nb_tags_distinct,'distinct tags remaining. (', nb_posts,'posts )')

We limit the minmum number of occurences for a tag to 43
379 distinct tags remaining. ( 43254 posts )


## Correlation matrix, dimension reduction of tags

In [86]:
df_corr = df_y.corr()
df_corr = df_corr.where(~np.tril(np.ones(df_corr.shape)).astype(np.bool))

In [87]:
df_corr_stack = df_corr.stack().reset_index()
df_corr_stack.columns = ['y1','y2','corr']
df_corr_stack = df_corr_stack.sort_values(by = 'corr', kind="quicksort", ascending=False)

In [88]:
tags_frequencies = pd.DataFrame(df_y.astype(bool).sum(axis=0), columns = ['y1_frequency'])
df_corr_stack = df_corr_stack.join(tags_frequencies, on ='y1')
tags_frequencies.columns = ['y2_frequency']
df_corr_stack = df_corr_stack.join(tags_frequencies, on ='y2')

In [167]:
#Orders columns so that y1 is always the bigger tag compared to y2 (in terms of how often it appears)
def order_by_freq(x):
    if x['y1_frequency'] < x['y2_frequency']:
        x['y1'], x['y2'] = x['y2'], x['y1']
        x['y1_frequency'], x['y2_frequency'] = x['y2_frequency'], x['y1_frequency']
    return x


In [224]:
df_corr_stack = df_corr_stack.apply(order_by_freq, axis=1
#Calculates a correlation score which is lower for high frequency items. As we prefer to keep high fre<quency tags and convert low frequency ones
df_corr_stack['freq_corr'] = 100*df_corr_stack['corr']/df_corr_stack['y2_frequency']

In [253]:
#383 = quantile 0.9 , not in top 10% most common tags
df_corr_top = df_corr_stack[(df_corr_stack['freq_corr']>0.055)].sort_values('corr', ascending=False)
print(df_corr_top.shape[0], 'tags associations selected')
print('Exemple of worst associations ')
df_corr_top.tail(7)

868 tags associations selected


Unnamed: 0,y1,y2,corr,y1_frequency,y2_frequency,relative_corr,freq_corr
6290,android-studio,android-recyclerview,0.025,266,44,0.088,0.058
808,class,abstract-class,0.025,267,44,0.087,0.057
19642,class,const,0.025,267,44,0.087,0.057
27045,vb.net,data-binding,0.025,271,44,0.087,0.057
8210,apache,https,0.025,128,44,0.068,0.057
10351,arrays,iteration,0.025,1269,44,0.156,0.056
929,java,abstract-class,0.024,5582,44,0.3,0.056


In [266]:
final_corr_set = set(df_corr_top['y1']) - set(df_corr_top['y2'])
print(len(final_corr_set), 'max tags in final list. Reduced from', df_corr_top.shape[0], 'similar tags')
df_corr_top = df_corr_top.sort_values('y1_frequency', ascending=False).reset_index()

86 max tags in final list. Reduced from 868 similar tags


In [260]:
#Here i will assoicate each secondary tag with a parent.
#If the parent is itself secondary, i will assocate it with the grand-parent, etc.. Which explains this complicated loop

df_final_tags = pd.DataFrame()
df_temp_corr = df_corr_top.copy()
parent_defining_col = 'corr'
i = 0
while df_temp_corr.shape[0] > 0:
    i+=1
    if i >10 :
        print('Error infinite loop, abort')
        exit
    secondary_tags = set(df_temp_corr['y2']) - set(df_temp_corr['y1'])
    print(len(secondary_tags), 'tags to treat')
    
    for secondary_tag in secondary_tags :
        parent = df_temp_corr[df_temp_corr.y2 == secondary_tag].sort_values(parent_defining_col, ascending=False).head(1)['y1'].values[0]
        if parent in final_corr_set:
            df_final_tags = df_final_tags.append([[parent, secondary_tag]] )
            df_temp_corr.drop(df_temp_corr[df_temp_corr.y2 == secondary_tag].index, inplace=True)
        else:
            df_temp_corr.loc[:, 'y1'][df_temp_corr.y2 == secondary_tag] = df_temp_corr[df_temp_corr.y2 == parent].sort_values(parent_defining_col, ascending=False).head(1)['y1'].values[0]
            print(secondary_tag, 'parent changed to grand-parent : ', parent)
    print(df_temp_corr.shape[0], 'tags left to sort')

df_final_tags = pd.DataFrame(df_final_tags[0].values, index = df_final_tags[1], columns=['replace_with'])
print(df_final_tags.shape[0] , 'secondary tags associated with ', len(df_final_tags.iloc[:,0].unique()), 'parents.')

110 tags to treat
flex parent changed to grand-parent :  actionscript-3
logging parent changed to grand-parent :  exception-handling
laravel-5 parent changed to grand-parent :  laravel


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


data-binding parent changed to grand-parent :  mvvm
netbeans parent changed to grand-parent :  swing
transactions parent changed to grand-parent :  hibernate
machine-learning parent changed to grand-parent :  numpy
frameworks parent changed to grand-parent :  model-view-controller
dataframe parent changed to grand-parent :  pandas
cmd parent changed to grand-parent :  batch-file
clojure parent changed to grand-parent :  functional-programming
razor parent changed to grand-parent :  asp.net-mvc-3
layout parent changed to grand-parent :  mobile
const parent changed to grand-parent :  reference
grails parent changed to grand-parent :  intellij-idea
iteration parent changed to grand-parent :  foreach
utf-8 parent changed to grand-parent :  unicode
ms-access parent changed to grand-parent :  vba
vim parent changed to grand-parent :  grep
uiview parent changed to grand-parent :  uiviewcontroller
plsql parent changed to grand-parent :  oracle
ant parent changed to grand-parent :  build
web-se

In [272]:
df_display = pd.DataFrame(index = df_y.columns)
df_display = df_display.join(df_final_tags)
df_children_count = df_final_tags.reset_index().groupby('replace_with').count()
df_children_count.columns=['children_count']
df_display = df_display.join(df_children_count)
pd.set_option('display.max_rows', 500)


In [275]:
df_display[df_display.index.str.contains('visual-studio') ]

Unnamed: 0,replace_with,children_count
visual-studio,,12.0
visual-studio-2008,visual-studio,
visual-studio-2010,,
visual-studio-2012,visual-studio,
visual-studio-2013,visual-studio,
visual-studio-2015,visual-studio,


In [276]:
df_display[df_display.index.str.contains('asp.net') ]

Unnamed: 0,replace_with,children_count
asp.net,,2.0
asp.net-core,visual-studio,
asp.net-mvc,,6.0
asp.net-mvc-3,asp.net-mvc,
asp.net-mvc-4,asp.net-mvc,
asp.net-web-api,asp.net-mvc,


In [286]:
print(df_display[df_display.replace_with.isna() & (df_display.children_count.isna())].shape[0], 'tags with no children and no parents')
print(df_display[df_display.replace_with.isna() & ~(df_display.children_count.isna())].shape[0], 'parent tags')
print(df_display[df_display.replace_with.isna() ].shape[0], 'tags total')

23 tags with no children and no parents
72 parent tags
95 tags total


## Exporting our tags' list

In [283]:
df_final_tags.to_csv('tags_to_replace.csv', index=True)
df_final_tags.to_csv('OCR_Project6_StackOverFlow/API/ressources/tags_to_replace.csv', index=True)
print('export done')

export done
