In [5]:
import pandas as pd
from IPython.core.display import HTML

#path = "C:\Users\Ali"

versions = pd.read_csv("KernelVersions.csv")
kernels = pd.read_csv("Kernels.csv")
users = pd.read_csv("Users.csv")

def pressence_check(title, tokens):
    present = False
    for token in tokens:
        words = token.split()
        if all(wrd.lower().strip() in title.lower() for wrd in words):
            present = True
    return present 
    
def get_kernels(tokens, n):
    versions['isRel'] = versions['Title'].apply(lambda x : pressence_check(x, tokens))
    relevant = versions[versions['isRel'] == 1]
    relevant = relevant.groupby('KernelId').agg({'TotalVotes' : 'sum', 'Title' : lambda x : "#".join(x).split("#")[0]})
    results = relevant.reset_index().sort_values('TotalVotes', ascending = False).head(n)
    results = results.rename(columns={'KernelId' : 'Id', 'TotalVotes': 'Votes'})
    results = results.merge(kernels, on="Id").sort_values('TotalVotes', ascending = False)
    results = results.merge(users.rename(columns={'Id':"AuthorUserId"}), on='AuthorUserId')
    return results[['Title', 'CurrentUrlSlug', 'TotalViews', 'TotalComments', 'TotalVotes', "DisplayName","UserName"]]


def best_kernels(tokens, n = 10):
    response = get_kernels(tokens, n)     
    hs = """<style>
                .rendered_html tr {font-size: 12px; text-align: left}
            </style>
            <h3><font color="#1768ea">"""+tokens[0].title()+"""</font></h3>
            <table>
            <th>
                <td><b>Kernel Title</b></td>
                <td><b>Author</b></td>
                <td><b>Total Views</b></td>
                <td><b>Total Comments</b></td>
                <td><b>Total Votes</b></td>
            </th>"""
    for i, row in response.iterrows():
        url = "https://www.kaggle.com/"+row['UserName']+"/"+row['CurrentUrlSlug']
        aurl= "https://www.kaggle.com/"+row['UserName']
        hs += """<tr>
                    <td>"""+str(i+1)+"""</td>
                    <td><a href="""+url+""" target="_blank"><b>"""  + row['Title'] + """</b></a></td>
                    <td><a href="""+aurl+""" target="_blank">"""  + row['DisplayName'] + """</a></td>
                    <td>"""+str(row['TotalViews'])+"""</td>
                    <td>"""+str(row['TotalComments'])+"""</td>
                    <td>"""+str(row['TotalVotes'])+"""</td>
                    </tr>"""
    hs += "</table>"
    display(HTML(hs))

# Data Science Glossary on Kaggle

Kaggle is the place to do data science projects. There are so many algorithms and concepts to learn. Kaggle Kernels are one of the best resources on internet to understand the practical implementation of algorithms. However there are almost 200,000 kernels published on kaggle and sometimes it becomes diffcult to search for the right implementation. 

Recently, Kaggle team updated the [Meta Kaggle](https://www.kaggle.com/kaggle/meta-kaggle) database and I am using it to collate the best kernels by different topics, and finally creating a glossary of machine learning, natural language processing algorithms shared on kaggle kernels. One can use this kernel as the one place to find other great kernels shared by great authors. Hope you like this kernel. 

## 1. Regression Algorithms


In [6]:
tokens = ["linear regression"]
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Price analysis and Linear Regression,Tony Pino,7891,10,42
2,Linear Regression to predict Market Value,ShubhamMaurya,7821,7,28
3,Simple Linear Regression,zohan,5192,8,21
4,Health Care Cost Prediction w/ Linear Regression,def me(x),1519,4,20
5,Predictions with XGboost and Linear Regression,MuhammetBurakErgenc,15767,2,18
6,Simple Linear Regression (0.0648835),JT,3115,12,17
7,In-Depth Simple Linear Regression,Nick Brooks,1027,5,15
8,Linear regression,Achal,4348,4,14
9,Simple One Feature Linear Regression,Ariadne,2023,8,12
10,Cereal ratings and linear regressions,Matthew Brachmann,1285,3,11


In [7]:
tokens = ['logistic regression', "logistic"]
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Logistic regression with words and char n-grams,Bojan Tunguz,29154,95,345
2,Logistic regression with words and char n-grams,thousandvoices,5699,14,79
3,Logistic Regression and ROC Curve Primer,Troy Walters,9489,18,79
4,Titanic Analysis with SVM+RF+DT+Logistic Reg,swamysm,13603,39,78
5,Example: Attacking logistic regression,Allunia,8056,5,77
6,Bayesian Logistic Regression with rstanarm,Aki Vehtari,18552,15,55
7,Simple logistic model LB(0.231),Sudhir Kumar,4067,25,49
8,Logistic Regression TFIDF,Sudhir Kumar,5426,8,32
9,Logistic of Genetic Features,Andy Harless,2749,11,35
10,Starter Logistic Regression in R,mlandry,7130,0,34


In [8]:
tokens = ['Stepwise regression']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Decision Tree and Stepwise AIC regression,VshnuVardhan,347,0,1
2,Stepwise linear regression,Praveen Hegde,2070,5,0
3,Stepwise logistic regression Titanic survival,ramoes,408,0,0
4,Titanic using stepwise linear regression,Paulo Cressoni,252,0,0
5,Stepwise logistic regression - AUC 98%,WeetJeData,199,0,0


In [9]:
tokens = ['polynomial regression']
best_kernels(tokens, 5)

0,1,2,3,4,5
1,Polynomial Regression|Adj R Sq=0.83|Acc=0.85,zohan,2428,11,15
2,Pumpkin Price Polynomial Regression,Aleksey Bilogur,641,1,5
3,Basic Polynomial Regression - India,Siddharth Nishtala,748,0,2
4,Lesson 1 - Linear (and Polynomial) Regression,Edoardo Ferrante,159,0,2
5,Polynomial Regression,humingtao,70,0,0


In [10]:
tokens = ['multivariate regression']
best_kernels(tokens, 5)

0,1,2,3,4,5
1,Multivariate Logistic Regression,Kaan Can,3387,43,109
2,MultiVariate Adaptive Regression Spline,ozcan,400,1,1
3,Multivariate regression,Andres Hernandez,254,0,0
4,Multivariate Logistic Regression from scratch,Rakend Dubba,10,0,0


## 2. Regularization Algorithms

In [11]:
tokens = ['Ridge']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Ridge (LB 0.41944),Serg Lavrikov,6060,34,155
2,Ridge Script,Alexandru Papiu,5383,24,122
3,Mercari RNN + 2Ridge models with notes,Patrick DeKelly,7007,14,101
4,More Effective Ridge LGBM Script,Bojan Tunguz,8587,24,94
5,avito_LightGBM with Ridge Feature,Himanshu Chaudhary,9582,25,85
6,LightGBM with Ridge Feature,Dan Emery,2934,9,55
7,Ridge Test,Yunfeng Zhu,10659,14,54
8,Wordbatch+Ridge + FM_FRTL + Target Encoding + LGBM,Samrat P,2475,14,48
9,Fork of avito_LightGBM with Ridge Feature V 3.0,Samrat P,4350,7,45
10,"Revision of ""Wordbatch+Ridge+FM_FRTL""",Peter Hurford,1573,7,42


In [12]:
tokens = ['Lasso']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,House prices: Lasso regression and a detailed EDA,Erik Bruin,24658,112,322
2,XGBoost + Lasso,Human Analog,24166,24,104
3,Lasso model for regression problem,Boris Klyus,10373,33,57
4,Titanic: Lasso/Ridge Implementation,Bisaria,6194,9,23
5,You got this!!!! Feature Engineering and Lasso,SarthakYadav,3982,11,21
6,Lasso + GBM +XGBOOST - Top 20% using R,Aniruddha Chakraborty,6008,8,19
7,XGboost + Ridge + Lasso,Julien Heiduk,13146,2,19
8,Lasso model for regression problem,Mohammed Amro,1585,8,15
9,Top 20% - Interpretable Solution using Lasso,Telmo Felgueira,390,3,9
10,FS(Lasso)+HyperParamTuning(HyperOpt),Abhilash Awasthi,430,0,9


In [13]:
tokens = ['ElasticNet']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,TruncatedSVD & ElasticNet,Ishaan Jain,3220,19,36
2,Top 7% using ElasticNet with Interactions,Jack Roberts,1593,14,30
3,ElasticNet (LB0.547) + feature importance,den3b,1350,6,20
4,"Stack of SVM,ElasticNet,XGBoost,RF,ET // ~ 0.554",Eike Dehling,2969,15,19
5,House Price predict score 0.14205 by ElasticNet,JuHyung,284,1,3
6,Lasso & ElasticNet,TahianaLuciaRAMORASAHASAMBATRA,290,0,3
7,ElasticNet & XGBoost Ensemble,Dominic DeBiaso,287,0,2
8,Ridge vs Lasso vs ElasticNet,JamesLawlor,275,0,1
9,"Lasso, Ridge and ElasticNet comparison",Dmitry,204,0,0
10,Twosigma PG + ElasticNetCV and ID level avg,Akila Wajirasena,139,1,0


## 3. Tree Based Models

In [14]:
tokens = ['Decision Tree']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Decision Tree Visualization & Submission,Arda Yildirim,69893,30,127
2,Explore & Explain: Density & Decision Trees,msjgriffiths,11559,19,87
3,Simple decision tree for the Titanic dataset,Diego Milla,16420,16,31
4,Prediction - Decision Tree & Neural Network,Sheik Mohamed Imran,1126,6,16
5,Voice Data - Gender Prediction (Decision Tree),Sheik Mohamed Imran,418,0,7
6,Decision Trees for Binary Classification (0.99),paultimothymooney,1308,2,16
7,Topic 3. Decision Trees and kNN,Yury Kashnitsky,89,0,11
8,Predicting with Decision Tree,Data Framed,708,4,8
9,Simple Decision Tree Model for Beginners,Arnab,722,0,8
10,Stephen Curry's Decision Tree,DrGuillermo,836,0,8


In [17]:
tokens = ['random forest']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Random Forests,DanB,44996,2,482
2,Quick & Dirty RandomForest,Megan Risdal,37304,36,191
3,Random Forest Example (R),Ben Hamner,112860,54,173
4,Random Forest Benchmark,Ben Hamner,52013,23,90
5,Random Forest on a Few Blocks,Alexandru Papiu,28282,36,159
6,"Feature Ranking w RandomForest, RFE, linear models",Anisotropic,20589,32,159
7,Random Forest Starter with numerical feature,Li Li,13149,18,158
8,Titanic Random Forest: 82.78%,ZlatanKremonic,4540,24,75
9,Random Forest Example,mlandry,38406,27,71
10,Random forest using elemental properties,Chris Bartel,3236,4,66


In [16]:
tokens = ['lightgbm', 'light gbm', 'lgb']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,LightGBM with Feature Engineering,Aguiar,16697,78,258
2,LightGBM (Fixing unbalanced data),Pranav Pandya,17125,84,172
3,Aggregated features & LightGBM,Benjamin Minixhofer,8712,31,168
4,"preprocessing, model averaging by xgb + lgb [1.40]",Alex,8309,66,157
5,"EDA, feature engineering and xgb + lgb",Andrew Lukyanenko,2134,7,135
6,"1st Place LGB Model(public:0.470, private:0.502)",piupiu,7691,77,133
7,non-blending lightGBM model LB: 0.977,Baris Kanber,11832,113,128
8,reorders with light GBM,paulantoine,19164,58,127
9,Kaggle-runnable version of Baris Kanber's LightGBM,Md Asraful Kabir,12328,45,124
10,LightGBM using weighted averages and dropout,James Shepherd,8611,35,121


In [18]:
tokens = ['xgboost', 'xgb']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Data Analysis & XGBoost Starter,anokas,71305,132,900
2,Simple XGBoost Starter (~0.0655),anokas,21334,36,171
3,"House prices: Lasso, XGBoost, and a detailed EDA",Erik Bruin,24658,112,322
4,Learning to Use XGBoost,DanB,33690,2,288
5,Understanding XGBoost Model on Otto Data,Tianqi Chen,108210,16,261
6,XGB CV,Andy Harless,23265,67,227
7,XGBoost Starter - LB,João Pedro Peinado,21571,85,186
8,mxnet + xgboost baseline [LB: 0.57],n01z3,21824,48,163
9,"preprocessing, model averaging by xgb + lgb [1.40]",Alex,8309,66,157
10,XGBoost Starter - LB 0.3791,Fabienvs,20293,65,154


In [19]:
tokens = ['catboost']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,LightGBM + XGBoost + Catboost,Samrat P,7541,15,108
2,"Stacking Test (Sklearn, XGBoost, CatBoost)",Eliot Barril,4783,20,74
3,Simple CatBoost,Nick Brooks,3590,10,60
4,Concise catboost starter ensemble (PLB: 0.06435),See--,4370,16,56
5,"CatBoost, StackedAE with MXNet, Meta",Tanrei(nama),2517,12,46
6,CatBooStarter,Vladimir Demidov,2722,2,45
7,Simple CatBoost,HaimFeldman,3672,16,34
8,Naive CatBoost,Bruno G. do Amaral,2109,1,27
9,CatBoost Starter,Bojan Tunguz,1472,13,24
10,CatBoost CV,Andy Harless,3386,7,24


In [20]:
tokens = ['adaboost']
best_kernels(tokens, 5)

0,1,2,3,4,5
1,adaboost,ALPHA_TX,1177,0,3
2,adaboost valid score,ALPHA_TX,322,0,2
3,Prediction with AdaBoost Algorithm,Lyuxun Yang,453,1,3
4,Classification using AdaBoost,medakk,997,5,2
5,"Bike-share-Feature selectn/engg,KNN,RF,dt+adaboost",SANDY,115,0,2


## 4. Neural Networks and Deep Learning

In [21]:
tokens = ['neural network']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Neural Network starter for recognising digits,Poonam Ligade,37619,89,283
2,Neural Network Approach,Claire Longo,9884,24,67
3,Random Forest vs XGBoost vs Deep Neural Network,Amandeep Rathee,14246,3,64
4,A Neural Network Model for House Prices,Julien Heiduk,15530,13,57
5,Surprise Me 2! Neural Networks(keras),NitinSurya,7044,12,53
6,Build your own neural network in R,JunMa,12131,14,52
7,Neural Network using Stochastic Gradient Descent,Jean Carlo Codogno,6163,43,42
8,Recurrent Neural Network with Pytorch,Kaan Can,795,10,40
9,3D Convolutional Neural Network w/o Programming,DeepMan,7502,12,35
10,Embedding with Neural Network,spongebob,1966,4,32


In [22]:
tokens = ['backpropagation']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,An introduction to backpropagation,Romain,199,1,4
2,Using normal backpropagation to predict,mrbeen25,60,0,0
3,Backpropagation,Saurabh,23,0,0


In [23]:
tokens = ['autoencoder']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Manifold Learning with Autoencoders,Alexandru Papiu,5057,8,45
2,Autoencoder and Deep Features,Loic Merckel,2803,6,31
3,H2O - Autoencoders and anomaly detection,Sheik Mohamed Imran,4791,2,21
4,2D Visualization: PCA vs Autoencoders,den3b,1668,6,16
5,Denoising: Autoencoders to the rescue!!,NAIN,995,5,15
6,Simple denoise autoencoder with Keras,Roberto Spadim,402,15,13
7,Denoising Autoencoder,OsciiArt,1042,1,13
8,1. Autoencoder with Keras,zihaox,4485,5,13
9,Visualizing MNIST using a Variational Autoencoder,Rebecca Vislay Wade,1594,0,11
10,Keras AutoEncoder with simple CNN,atom1231,1923,10,10


In [24]:
tokens = ['deep learning']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,DEEP LEARNING,Kaan Can,26251,137,353
2,Intro to Computer Vision for Deep Learning-WIP,DanB,54263,33,194
3,A Deeper Understanding of Deep Learning,DanB,8640,5,117
4,Rectified Linear Units (ReLU) in Deep Learning,DanB,6836,3,59
5,Deep learning support,Alexander Kireev,12165,94,139
6,Welcome to Deep Learning (99% CNN),Peter Grenholm,18565,44,132
7,Deep learning with TensorFlow,James Shepherd,7643,49,119
8,EDA Recommender SystemDeep LearningModel Intuition,Badal Gupta,2473,32,105
9,Deep Learning,Umberto,14370,46,78
10,Starting Kit for PyTorch Deep Learning,Mamy Ratsimbazafy,13268,17,61


In [25]:
tokens = ['convolutional neural networks', 'cnn']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Digits Recognizer - Keras CNN - 0.997 (top 8%),Yassine Ghouzam,48796,210,1052
2,Keras CNN with 99.5% accuracy,Peter Grenholm,18565,44,132
3,Transfer Learning with VGG-16 CNN+AUG LB 0.1725,DeveshMaheshwari,13356,79,118
4,Keras CNN - StatOil Iceberg LB 0.1995,TheGruffalo,9656,72,90
5,Digit recognizer in Python using CNN,Koba,39251,19,84
6,CNN with Keras,bgo,12344,17,79
7,Bi-GRU-CNN-Poolings with FastText,MengYe,5861,24,73
8,Breast Cancer Image Class. CNN 80% Valid. Acc.,Raoul,5555,8,64
9,"1D CNN (single model score: 0.14, 0.16 or 0.23)",Alex,5809,21,60
10,TextCNN (2D Convolution),Vladimir Demidov,4436,8,58


In [26]:
tokens = ['lstm']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,LSTM with fine-tuned word2vec embeddings,lystdo,50162,170,205
2,[ LB 0.18+ ] LSTM with GloVe and magic features,lystdo,7185,75,70
3,Improved LSTM baseline (GloVe + dropout),Jeremy Howard,18180,34,203
4,Minimal LSTM + NB-SVM baseline ensemble,Jeremy Howard,14260,14,155
5,Keras - Bidirectional LSTM baseline,CVxTz,19363,41,197
6,Bidirectional LSTM with Convolution,Ashish Gupta,9897,44,86
7,"keras lstm attention glove840b,lb 0.043",qianqian,8264,18,80
8,"Basic NLP: Bag of Words, TF-IDF, Word2Vec, LSTM",ReiiNakano,16837,18,68
9,Explore TS with LSTM,Vladimir Demidov,4669,14,61
10,Fork of LSTM_Stock_prediction-20170419,BenF,20246,36,57


In [27]:
tokens = ['gru']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Pooled GRU,Vladimir Demidov,20935,38,220
2,Capsule net with GRU,chongjiujjin,10086,33,147
3,Grupo Bimbo data analysis,Fabienvs,18630,20,116
4,"RNN_GRU_with_Keras(512-64,relu)_0.44025 c325a2",Kostiantyn Isaienkov,10891,8,85
5,Bi-GRU-CNN-Poolings with FastText,MengYe,5861,24,73
6,(How to get 81%) GRU-ATT + LGBM + TF-IDF + EDA,Peter,2685,30,69
7,LGB + GRU + LR + LSTM + NB-SVM Average Ensemble,Peter Hurford,7189,13,53
8,Pooled GRU (with preprocessing),Prashant Kikani,3764,20,46
9,"RNN_GRU_with_Keras(512-64,relu)_0.44694",yyll008,5008,12,44
10,NY Stock Price Prediction RNN LSTM GRU,Raoul,12754,12,41


In [28]:
tokens = ['mxnet']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,mxnet + xgboost baseline [LB: 0.57],n01z3,21824,48,163
2,mxnet + xgboost simple solution,n01z3,16017,24,57
3,"CatBoost, StackedAE with MXNet, Meta",Tanrei(nama),2517,12,46
4,R Mxnet convnet simple tutorial,miguel perez,15095,1,17
5,R & MXNET,gmilosev,8581,17,13
6,XGBoost + mxnet in R,Paweł Romański,5110,13,11
7,Digit Recognizer - Using Mxnet 2,BlastChar,118,0,4
8,MXNET with R starter kit,jeremie_db,2798,5,4
9,MXnet deep NN,Tornadozou,1664,0,3
10,mxnet // cnn_1d 0.945 acc [FULL-SET],Lefteris Fanioudakis,36,0,2


In [29]:
tokens = ['resnet']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Image Classification and Quality Score w/ ResNet50,Wesam Elshamy,6166,34,122
2,EDA and CNN (resnet-18) (LB 0.2094),Prince Grover,2781,7,35
3,Complete process using ResNet as a starting point,Rodney Thomas,5234,28,31
4,End-to-End ResNet50 with TTA [LB ~0.93],Sasha Korekov,1911,10,31
5,Objects + Bounding Boxes using Resnet50 - ImageAI,sban,1545,9,22
6,resnet50 features + xgboost,n01z3,7297,4,19
7,Keras ResNet with image augmentation,AndreasFalkoven,1418,1,17
8,ResNet50 Exam,beluga,2702,3,12
9,Feature Extraction by ResNet (keras,Chia-Ta Tsai,1788,8,11
10,resnet50 features + xgboost,Travis Glines,2323,2,10


In [30]:
tokens = ['Capsule network', 'capsulenet']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Beginner's Guide to CapsuleNet,Zafar,10514,52,298
2,CapsuleNet on MNIST,Kevin Mader,29589,21,142
3,CapsuleNet on Fashion MNIST,Kevin Mader,4279,0,43
4,CapsuleNet on Lung Nodules,Kevin Mader,728,2,5
5,All you need to know about Capsule Networks,AnkitJha,365,8,11
6,Capsule Networks on Description,Chin Ee Kin,495,2,9
7,Lets try CapsuleNet,Jason Benner,1210,4,8
8,Beginner's Guide to Capsule Networks,AshishPatel,41,0,3
9,CapsuleNetwork for Russian Characters,intrepidnomad,76,0,1


## 5. Clustering Algorithms 

In [31]:
tokens = ['kmeans', 'k means']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,PCA visualisations with KMeans clustering,Anisotropic,46034,55,201
2,Log MA and Days of Week Means (LB: 0.534),Paulo Pinto,10438,31,143
3,Aggregates + SumValues + SumZeros + K-Means + PCA,Samrat P,3054,20,43
4,Clustering wines with k-means,Xavier,2199,12,42
5,Visualizing k-means with Leaf Dataset,Selfish Gene,4238,1,36
6,3D Kmeans animation,DrGuillermo,1031,3,15
7,kmeans_example,kajot,6366,27,12
8,K-means Clustering of 1 million headlines,Siddharth Yadav,644,6,11
9,Simple K-means clustering on the Iris dataset,Tim I,7593,1,11
10,Using K-Means Clustering to Predict Helpfulness,Amee Amin,6952,6,10


In [32]:
tokens = ['hierarchical clustering']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Detecting groups with Hierarchical Clustering,Marcel Spitzer,1719,4,19
2,Hierarchical Clustering vs. k-Means,Ralph Schlosser,2695,2,2
3,Hierarchical Clustering for Iris dataset,Chakra,388,0,1
4,Hierarchical and k-means clustering,Vahe Shelunts,165,0,1
5,IRIS Clustering with K-means & Hierarchical,shan,625,0,0
6,Hierarchical Clustering Mean-Link,Tarun Sunkaraneni,61,0,0
7,Hierarchical Clustering Complete-Link,Tarun Sunkaraneni,259,0,0
8,Hierarchical and Point-Assignment Clustering,Tarun Sunkaraneni,754,0,0
9,Hierarchical Clustering to cluster courses,Kanika Narang,314,0,0
10,Hierarchical Clustering - Dendogram,Febi Agil Ifdillah,263,0,0


In [33]:
tokens = ['dbscan']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,mod DBSCAN 0.3472,Grzegorz Sionkowski,7684,62,94
2,DBSCAN,Mikhail Hushchyn,4617,21,49
3,HDBSCAN clustering,Luis Andre Dutra e Silva,1769,4,38
4,DBSCAN Benchmark 6cd3a5,Yair Beer,1014,8,24
5,Chocolate ratings-Outlier analysis with DBScan,Teza,1525,0,21
6,DBSCAN for CERN,Byfone,2196,4,20
7,HDBSCAN clustering (Inspired by...),the1owl,841,0,15
8,"Creating a submission, validation, starter DBSCAN",Robert Tacbad,705,2,13
9,classifier+optimalhdbscan+helix,Siddhartha,807,3,10
10,HDBSCAN clustering II,Alexander Zinovev,668,3,9


## 6. Misc 

In [34]:
tokens = ['naive bayes']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Naive Bayesian Network with 7 features,J.Jones,1691,8,22
2,NCI Thesaurus & Naive Bayes,Loic Merckel,2309,8,17
3,Benouilli Naive Bayes,Scirpus,5020,18,13
4,Credit Card Fraud Detection: KNN & Naive Bayes,Yura Shakhnazaryan,1917,3,9
5,Fraud Detection with Naive Bayes Classifier,Lovedeep Saini,2930,1,8
6,Naive Bayes and Top1 only,Kate,3047,1,8
7,Spooky Simple Naive Bayes Scores ~0.399,Tom Nelson,561,2,8
8,Naive Bayes without a ML Library,Ryder,347,1,8
9,Simple Naive Bayes classifier,Sudhir Kumar,1034,4,8
10,Sentiment Analysis using LR & Naive Bayes,Megabus,2332,2,7


In [35]:
tokens = ['svm']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,NB-SVM baseline (0.06 lb),Jeremy Howard,47468,102,580
2,Minimal LSTM + NB-SVM baseline ensemble,Jeremy Howard,14260,14,155
3,Evaluating SVM vs KNN on Iris Data,Gabriel Kerr,11708,17,79
4,Titanic Analysis with SVM+KNN+RF+Decision Tree,swamysm,13603,39,78
5,"Fraud Detection by Random Forest,DT and SVM",swamysm,3300,5,32
6,LGB + GRU + LR + LSTM + NB-SVM Average Ensemble,Peter Hurford,7189,13,53
7,Whats Cooking : TF IDF with OvR SVM,sban,1131,19,32
8,Minimal LSTM + NB-SVM baseline ensemble (lb 0.044),Ivan,6393,11,31
9,"Wine Quality EDA, DT, RF, SVM, xgboost and h2o",Owen Ouyang,1454,17,28
10,extra engineered features w/ SVM,rerock,4047,0,25


In [36]:
tokens = ['ensemble']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Titanic Top 4% with ensemble modeling,Yassine Ghouzam,31832,139,562
2,Detailed Data Analysis & Ensemble Modeling,Tanner Carbonati,28263,75,235
3,Ensemble Model: Stacked Model Example,JMT5802,44137,69,163
4,Minimal LSTM + NB-SVM baseline ensemble,Jeremy Howard,14260,14,155
5,EDA & Ensemble Model (Top 10 Percentile),Vivek Srinivasan,18129,28,123
6,Porto Seguro Tutorial: end-to-end ensemble,Yifan Xie,3341,6,86
7,Concise catboost starter ensemble (PLB: 0.06435),See--,4370,16,56
8,ML-Ensemble: Scikit-learn style ensemble learning,flnr,8481,33,55
9,LGB + GRU + LR + LSTM + NB-SVM Average Ensemble,Peter Hurford,7189,13,53
10,Statoil CSV PyTorch ensemble LB 0.1690,QuantScientist,7501,43,48


In [37]:
tokens = ['stacking', 'stack']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Introduction to Ensembling/Stacking in Python,Anisotropic,221551,504,1994
2,Stacked Regressions : Top 5%,Serigne,83283,293,1368
3,Min Max Stacking Starter (LB),DSEverything,12942,25,166
4,Ensemble Model: Stacked Model Example,JMT5802,44137,69,163
5,Simple Stacker LB 0.284,Vladimir Demidov,14883,77,153
6,stacked,Hakeem,19673,55,126
7,Stacking Starter,Faron,23958,27,125
8,Deep analysis of stackoverflow survey 2018,RanjeetJain,2905,40,120
9,OOF stacking regime,Håkon Hapnes Strand,7112,50,119
10,Simple Linear Stacking,Andy Harless,8523,23,118


In [38]:
tokens = ['feature engineering']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Feature Engineering & Importance Testing,NanoMathias,17652,104,286
2,Introduction to Manual Feature Engineering,Will Koehrsen,7496,28,173
3,Automated Feature Engineering Basics,Will Koehrsen,4662,15,83
4,"EDA, feature engineering and xgb + lgb",Andrew Lukyanenko,2134,7,135
5,Spooky Author - Feature Engineering,BuryBuryZymon,5608,28,130
6,Advanced Feature Engineering,Marcel Spitzer,11399,32,111
7,Feature engineering,Eike Dehling,5665,22,95
8,Feature Engineering & Validation Strategy,SRK,6338,25,86
9,HOME CREDIT - BUREAU DATA - FEATURE ENGINEERING,Shanth,3969,25,78
10,Feature engineering 1: Sentiment analysis,Cro-Magnon,5598,18,73


In [39]:
tokens = ['feature selection']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Feature Selection and Data Visualization,Kaan Can,36157,185,447
2,Feature Selection with Null Importances,olivier,4459,42,118
3,An exploratory study on feature selection,Santhosh Sharma Ananthramu,10968,8,83
4,Feature Selection,Will Koehrsen,3692,22,82
5,Feature Selection and Prediction,ZhiboYang,11913,28,54
6,6 Ways for Feature Selection,oskird,2072,15,50
7,Easy Feature Selection pipeline: 0.55+ at LB,Arseny Kravchenko,1825,5,32
8,House Prices: Feature Selection and XGBoost,Li-Yen Hsu,3414,6,20
9,Comparison of Model-based feature selection,Jason Liu,3968,0,19
10,Using XGBoost For Feature Selection,MeiChengShih,5028,6,17


In [40]:
tokens = ['cross validation']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Cross-Validation,DanB,16003,27,103
2,"Cross-validation, weighted linear blending, errors",Tilii,3052,19,60
3,Correct time-aware cross-validation scheme,Yury Kashnitsky,1396,2,30
4,Manager Skill for Cross-Validation Pipelines,Maximilian Hahn,3007,18,26
5,Cross-validation methodology,BreakfastPirate,1182,2,24
6,Simple Grasp Cross-validation,Alexandre Barachant,2525,2,13
7,Training set split for cross validation,Yifan Xie,4796,13,12
8,Proper Cross-Validation,Stergios,2717,5,12
9,Simple Keras Model with k-fold cross validation,Stefanie04736,7582,1,9
10,"Data Augmentation, Cross-Validation",Sharif Amit,412,5,9


In [41]:
tokens = ['model selection']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Quora Interactive EDA & Model selection,Philipp Schmidt,21705,35,212
2,Cervix EDA & Model selection,Philipp Schmidt,19727,34,159
3,Monte Carlo Model Selection,ForzzeeTeam,2391,7,45
4,Feature Selection and Ensemble of 5 Models,Li-Yen Hsu,3414,6,20
5,Comparison of Model-based feature selection,Jason Liu,3968,0,19
6,Model and feature selection with Python,Sergio Rodrigues,4975,1,15
7,Cold Calls: Data Mining and Model Selection,Emma Ren,2006,3,14
8,In Depth Model Selection,dataWrangler,745,0,5
9,EDA and Model Selection,Amol Mavuduru,170,0,3
10,[.997 Acc] Model Selection + Hyperparameter Tuning,Quan Nguyen,503,0,2


In [42]:
tokens = ['smote']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Fraud Detector || Random UnderSample vs SMOTE,Alexander Bachmann,4261,57,79
2,Exploring Fraud Transactions + SMOTE Pipeline,Leonardo Ferreira,1357,8,25
3,Fraud Detection with SMOTE and XGBoost,Bono,2292,1,10
4,Fraud detection with SMOTE and RandomForest,Christophe Taret,4258,4,9
5,SMOTE in R using Treebag,ML_Enthusiast,1975,0,7
6,Predict Product Backorders with SMOTE and RF,HaimFeldman,1700,3,6
7,SMOTE Classify,AkhiyarWaladi,350,1,5
8,"Clustering for Resampling(better than SMOTE, ROSE)",SanjayKumarM,2375,0,5
9,SMOTE with Imbalance Data,Lving,2589,4,5
10,Recall is 1 and AUC is 0.9998 by SMOTE and RF,YiChi,567,0,4


## 7. ML Tools

In [43]:
tokens = ['scikit']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Your First Scikit-Learn Model,DanB,77663,213,591
2,Scikit-Learn ML from Start to Finish,Jeff Delaney,66334,79,272
3,10 Classifier Showdown in Scikit-Learn,Jeff Delaney,28120,57,205
4,ML-Ensemble: Scikit-learn style ensemble learning,flnr,8481,33,55
5,Scikit-Learn ML from Start to Finish,Rajat Shah,3547,8,29
6,grid search xgboost with scikit-learn,Kazuaki Tanida,31631,8,22
7,Principal Component Analysis with Scikit-Learn,Niraj Verma,5835,9,21
8,Tips for Using Scikit-Learn for Evaluation,cass,1050,0,18
9,Scikit-learn pipelines and pandas,JanKoch,7092,8,15
10,Classifying News Headlines with scikit-learn,Ed King,4001,1,13


In [44]:
tokens = ['tensorflow', 'tensor flow']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,TensorFlow CNN,Kirill Kliavin,104261,183,524
2,Predicting Fraud (TensorFlow),Currie32,50269,47,168
3,Nuclei Segmentation TensorFlow U-Net 0.27 Prec.,Raoul,8650,31,131
4,Cancer Image TensorFlow CNN 80% Valid. Acc.,Raoul,5555,8,64
5,Data augmentation and Tensorflow U-Net,ShenShen,14503,10,108
6,Working in TensorFlow and Keras,DanB,31294,31,103
7,Tensorflow starter: conv1d + embeddings (0.442 LB),ololo,3940,27,75
8,Multi-GPU tensorflow convnet,Adam Blazek,11910,66,59
9,Neural Network Model for House Prices (TensorFlow),Julien Heiduk,15530,13,57
10,Basic U-net using Tensorflow,Vijay Jadhav,8761,24,45


In [45]:
tokens = ['theano']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Theano+Lasange Starter,Florian Muellerklein,12570,24,25
2,Fast LeNet5 CNN in Theano for GPU,Lukasz 8000,13151,9,22
3,Training a U-Net model in keras Theano,Ramiro Debbe,1405,2,8
4,Open dataset - theano tensor first image,Paul Larmuseau,233,0,3
5,tensor theano,Alaa Awad,745,0,2
6,TheanoLasagne - Fork Florian Muellerkle,Andre lopes,829,0,2
7,Practice Theano Logistic Regression,Andrew Blaikie,125,0,1
8,Test_for_theano,jack,423,0,1
9,Fast LeNet5 CNN in Theano for GPU,Jundong Qiao,161,0,1
10,Theano conv network,Tehnar,674,0,1


In [46]:
tokens = ['keras']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Digits Recognizer - Keras CNN - 0.997 (top 8%),Yassine Ghouzam,48796,210,1052
2,Keras U-Net starter,Kjetil Åmdal-Sævik,75705,131,795
3,Simple Keras Model for Beginners (0.201 on LB)+EDA,DeveshMaheshwari,23077,84,362
4,Deep Neural Network Keras way,Poonam Ligade,37619,89,283
5,[For Beginners] Tackling Toxic Using Keras,Bongo,14593,35,268
6,A simple nn solution with Keras,noobhound,19865,79,240
7,Use pretrained Keras models,beluga,28503,47,225
8,Keras - Bidirectional LSTM baseline,CVxTz,19363,41,197
9,End-to-end baseline with U-net (keras),n01z3,34160,130,163
10,CatdogNet - EDA and Keras ConvNet Starter,Jeff Delaney,36542,67,161


In [47]:
tokens = ['pytorch']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,PyTorch starting kit (data loading to training),Mamy Ratsimbazafy,13268,17,61
2,Pytorch Tutorial for Deep Learning Lovers,Kaan Can,2987,14,57
3,Recurrent Neural Network with Pytorch,Kaan Can,795,10,40
4,Statoil CSV PyTorch ensemble LB 0.1690,QuantScientist,7501,43,48
5,CNN Ratings - PyTorch,QuantScientist,10277,26,37
6,PyTorch GPU CNN & BCELoss with predictions,QuantScientist,7594,16,32
7,Pytorch starter,Austin,3154,9,37
8,Pre-Trained PyTorch Monkeys: A Deep Dream,paultimothymooney,1649,16,36
9,Simple PyTorch,Leigh,826,1,30
10,PyTorch Tutorials on DSB2018,Yun Chen,3598,12,29


In [48]:
tokens = ['vowpal wabbit','vowpalwabbit']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Vowpal Wabbit tutorial: blazingly fast learning,Yury Kashnitsky,10352,22,364
2,Topic 8. Online learning and Vowpal Wabbit,Yury Kashnitsky,70,0,12
3,"Fast, low memory learning - part 1: VowpalWabbit",Aimoldin Anuar [dsmlkz],549,3,37
4,Vowpal Wabbit - input file preparation,Konrad Banachewicz,310,1,11
5,Vowpal Wabbit,macDigger,187,2,4
6,Titanic encounters Vowpal Wabbit and R,Ivan Bajdarvanov,324,0,3
7,Part 2: Titanic encounters Vowpal Wabbit and R,Ivan Bajdarvanov,128,0,0
8,LRM Fast - Vowpal Wabbit Implementation,Pulkit Jha,99,3,2
9,Vowpal Wabbit decides who lives and who dies,Misha Lisovyi,66,0,1
10,try to understand vowpalwabbit,Evgenii Zhukov,29,0,0


In [49]:
tokens = ['tensorflow', 'tensor flow']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,TensorFlow CNN,Kirill Kliavin,104261,183,524
2,Predicting Fraud (TensorFlow),Currie32,50269,47,168
3,Nuclei Segmentation TensorFlow U-Net 0.27 Prec.,Raoul,8650,31,131
4,Cancer Image TensorFlow CNN 80% Valid. Acc.,Raoul,5555,8,64
5,Data augmentation and Tensorflow U-Net,ShenShen,14503,10,108
6,Working in TensorFlow and Keras,DanB,31294,31,103
7,Tensorflow starter: conv1d + embeddings (0.442 LB),ololo,3940,27,75
8,Multi-GPU tensorflow convnet,Adam Blazek,11910,66,59
9,Neural Network Model for House Prices (TensorFlow),Julien Heiduk,15530,13,57
10,Basic U-net using Tensorflow,Vijay Jadhav,8761,24,45


In [50]:
tokens = ['eli5']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,ELI5 for Mercari,Konstantin Lopuhin,11094,70,216
2,eli5 example,Konstantin Lopuhin,194,0,13
3,"Understanding Approval-DonorsChoose-EDA,FE,ELI5",Jagan,1437,5,34
4,ELI5 for TOXIC,Sergei Fironov,715,4,19
5,ELI5 What's Different About the Test Set? (EDA),Peter Hurford,297,4,14


## 8. Data Visualization

In [51]:
tokens = ['visualization', 'visualisation']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Python Data Visualizations,Ben Hamner,167395,116,761
2,Data investigation and visualization,DavidS,36642,87,631
3,Feature Selection and Data Visualization,Kaan Can,36157,185,447
4,Strength of visualization-python visuals tutorial,BuryBuryZymon,32138,92,396
5,!! In-Depth Analysis & Visualisations - AVITO !!,sban,14018,71,305
6,Geolocation visualisations,BeyondBeneath,20784,40,225
7,Detailed Cleaning/Visualization (Python),"Alan ""AJ"" Pryor, Ph.D.",34293,70,205
8,Detailed Cleaning/Visualization,"Alan ""AJ"" Pryor, Ph.D.",24053,38,148
9,Map visualizations with external shapefile,Jordan Tremoureux,4972,36,164
10,Welcome to data visualization,Aleksey Bilogur,46106,16,145


In [52]:
tokens = ['plotly', 'plot.ly']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,Interactive Insights via Plot.ly,Anisotropic,29062,128,414
2,Interactive Decision Boundaries - Python & Plotly,Anisotropic,16211,42,195
3,Our Young and their wasted Youth? PLOTLY visuals,Anisotropic,17880,73,188
4,SQL and Python primer - Bokeh and Plotly,Anisotropic,16671,32,168
5,Global Religion 1945-2010: Plotly/Pandas visuals,Anisotropic,6520,29,90
6,Plotly Tutorial for Beginners,Kaan Can,7784,66,187
7,Intermediate visualization tutorial using Plotly,Siddharth Yadav,2136,44,78
8,Mass Shooting in US (using plotly),Anton Aksyonov,4643,9,56
9,Wanna Explore Physics particle using Plotly ??,Lathwal,4559,3,53
10,EDA with Plotly,AdhokshajaPradeep,14873,38,47


In [53]:
tokens = ['seaborn']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,"Feat_select(corr,rfe,rfecv,PCA)Seaborn,RandForest",Kaan Can,36157,185,447
2,Seaborn for Beginners,Kaan Can,18195,118,215
3,Visualizing Pokémon Stats with Seaborn,Andrew Gelé,27150,51,158
4,Plotting with seaborn,Aleksey Bilogur,16091,19,78
5,Faceting with Seaborn,Aleksey Bilogur,9217,17,56
6,Humble Intro to Analysis with Pandas and Seaborn,Chris Crawford,9204,12,63
7,"Plotting with pandas, matplotlib, and seaborn",Data Framed,1321,19,41
8,IPL data exploration with seaborn,RanjeetJain,615,10,19
9,Seaborn Visualization,NeilS,2234,8,17
10,Python Seaborn PairPlot Example,Ben Hamner,28450,6,16


In [54]:
tokens = ['bokeh']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,SQL and Python primer - Bokeh Heatmaps,Anisotropic,16671,32,168
2,Bokeh Tutorial Data Visualization,Kaan Can,7255,7,63
3,Interactive Bokeh Tutorial Part 2,Kaan Can,2734,4,38
4,Karnataka Primary Education - EDA using Bokeh,Pavan Sanagapati,614,7,29
5,EDA with python library bokeh,naveenkb,395,1,12
6,Visualization with t-SNE and Bokeh,Yohan,1953,1,10
7,Exploratory Data Analysis with Bokeh,dtromero,1671,0,8
8,Visualization of trips using bokeh and Datashader,saihttam,604,2,8
9,Interactive Visualization with Bokeh!,Phil Butcher,4172,1,6
10,Exploring and Visualizing using bokeh,itzzthad,1676,3,5


## 8. Dimentionality Reduction

In [55]:
tokens = ['PCA']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,"Feat_select(corr, rfe, rfecv, PCA),RandForest",Kaan Can,36157,185,447
2,Image Completion and PCA,Kaan Can,3387,43,109
3,Customer Segments with PCA,Andrea,15880,16,114
4,"Dimensionality reduction (PCA, tSNE)",Tilii,6568,32,94
5,Visualizing PCA with Leafs,Selfish Gene,7571,18,66
6,"All You Need is PCA (LB: 0.11421, top 4%)",massquantity,4223,19,53
7,PCA visualization,Tuomas Tikkanen,17485,20,45
8,Aggregates + SumValues + SumZeros + K-Means + PCA,Samrat P,3054,20,43
9,Use Partial PCA for Collinearity,Gccering,1081,7,38
10,TSNE vs PCA,Mary Vikhreva,7612,4,36


In [56]:
tokens = ['Tsne', 't-sne']
best_kernels(tokens, 10)

0,1,2,3,4,5
1,"Dimensionality reduction (PCA, tSNE)",Tilii,6568,32,94
2,Four Blob TSNE - with (legal) supplements,Tilii,1452,9,26
3,Visualizing Word Vectors with t-SNE,Jeff Delaney,18358,20,74
4,Mapping digits with a t-SNE lens,Triskelion,10870,5,36
5,TSNE vs PCA,Mary Vikhreva,7612,4,36
6,TSNE & PCA Quick and Dirty Visuals,Anisotropic,2918,8,29
7,Clustering in 2 dimension using tsne,puyokw,20639,3,29
8,PCA and (TSNE),Øystein Schønning-Johansen,2818,8,28
9,Visualization on a 2D map (with t-SNE),Jean-Matthieu Schertzer,4526,2,25


<br>
Suggest the list of items which can be added to the list. If you liked this kernel, please upvote.  
