In [2]:
import pandas as pd

# Location of the Kaggle "voted datasets" export; the path is relative, so it
# is resolved against the notebook's working directory.
file_path = 'voted-kaggle-dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the columns that were parsed.
data.head(n=5)

Unnamed: 0,Title,Subtitle,Owner,Votes,Versions,Tags,Data Type,Size,License,Views,Download,Kernels,Topics,URL,Description
0,Credit Card Fraud Detection,Anonymized credit card transactions labeled as...,Machine Learning Group - ULB,1241,"Version 2,2016-11-05|Version 1,2016-11-03",crime\nfinance,CSV,144 MB,ODbL,"442,136 views","53,128 downloads","1,782 kernels",26 topics,https://www.kaggle.com/mlg-ulb/creditcardfraud,The datasets contains transactions made by cre...
1,European Soccer Database,"25k+ matches, players & teams attributes for E...",Hugo Mathien,1046,"Version 10,2016-10-24|Version 9,2016-10-24|Ver...",association football\neurope,SQLite,299 MB,ODbL,"396,214 views","46,367 downloads","1,459 kernels",75 topics,https://www.kaggle.com/hugomathien/soccer,The ultimate Soccer database for data analysis...
2,TMDB 5000 Movie Dataset,"Metadata on ~5,000 movies from TMDb",The Movie Database (TMDb),1024,"Version 2,2017-09-28",film,CSV,44 MB,Other,"446,255 views","62,002 downloads","1,394 kernels",46 topics,https://www.kaggle.com/tmdb/tmdb-movie-metadata,Background\nWhat can we say about the success ...
3,Global Terrorism Database,"More than 170,000 terrorist attacks worldwide,...",START Consortium,789,"Version 2,2017-07-19|Version 1,2016-12-08",crime\nterrorism\ninternational relations,CSV,144 MB,Other,"187,877 views","26,309 downloads",608 kernels,11 topics,https://www.kaggle.com/START-UMD/gtd,"Context\nInformation on more than 170,000 Terr..."
4,Bitcoin Historical Data,Bitcoin data at 1-min intervals from select ex...,Zielak,618,"Version 11,2018-01-11|Version 10,2017-11-17|Ve...",history\nfinance,CSV,119 MB,CC4,"146,734 views","16,868 downloads",68 kernels,13 topics,https://www.kaggle.com/mczielinski/bitcoin-his...,Context\nBitcoin is the longest running and mo...


In [3]:
# Count the missing cells in every column, then display only the columns that
# actually have at least one missing value.
missing_values = data.isna().sum()
missing_values.loc[missing_values.gt(0)]

Subtitle       104
Versions         5
Tags           542
Views            5
Download        15
Kernels        944
Topics         592
Description      5
dtype: int64

In [4]:
# Replace missing descriptions with a placeholder string.
#
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on `data['Description']`: that form operates on a
# temporary Series (chained assignment), which emits a FutureWarning on recent
# pandas 2.x releases and leaves `data` untouched once Copy-on-Write is active
# (the default from pandas 3.0). Assigning back is also idempotent on re-run.
data['Description'] = data['Description'].fillna('No description')

# Confirm that no missing descriptions remain and preview the column.
missing_descriptions = data['Description'].isnull().sum()
missing_descriptions, data['Description'].head()

(0,
 0    The datasets contains transactions made by cre...
 1    The ultimate Soccer database for data analysis...
 2    Background\nWhat can we say about the success ...
 3    Context\nInformation on more than 170,000 Terr...
 4    Context\nBitcoin is the longest running and mo...
 Name: Description, dtype: object)

In [5]:
import re
import string

def normalize_text(text):
    """Lower-case ``text`` and keep only the letters a-z and whitespace.

    Two substitutions run in sequence on the lower-cased string: the first
    deletes every character in ``string.punctuation``, the second deletes
    anything that is still not ``a``-``z`` or whitespace (digits, non-ASCII
    characters, ...). Characters are deleted, not replaced by a space, so
    tokens joined only by punctuation fuse together (``"a-b"`` -> ``"ab"``).

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    cleaned = text.lower()
    removal_patterns = (
        f'[{re.escape(string.punctuation)}]',  # ASCII punctuation
        r'[^a-z\s]',                           # everything but a-z / whitespace
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Normalise every description element-wise; the cleaned text overwrites the
# original column, so the raw descriptions are no longer available afterwards.
data['Description'] = data['Description'].map(normalize_text)

# Preview the cleaned text.
data['Description'].head(n=5)

0    the datasets contains transactions made by cre...
1    the ultimate soccer database for data analysis...
2    background\nwhat can we say about the success ...
3    context\ninformation on more than  terrorist a...
4    context\nbitcoin is the longest running and mo...
Name: Description, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over the normalised descriptions, with scikit-learn's
# built-in English stop-word list removed.
vectorizer = CountVectorizer(stop_words='english')
data_vectorized = vectorizer.fit_transform(data['Description'])

# Fit a 10-topic LDA model; the fixed random_state keeps the topics
# reproducible between runs. `fit` returns the estimator itself.
number_of_topics = 10
lda = LatentDirichletAllocation(
    n_components=number_of_topics,
    random_state=0,
).fit(data_vectorized)

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line holding the names of the ``no_top_words``
    largest entries of that row, ordered from largest to smallest and joined
    by single spaces.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its feature name.
        no_top_words: How many names to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_names = [feature_names[i] for i in strongest_first]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_names))

# Show the ten highest-weighted vocabulary terms for each fitted topic.
no_top_words = 10
display_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=no_top_words,
)

Topic 0:
data dataset use model images set contains context features used
Topic 1:
dataset movie movies data pokemon time number model learning residual
Topic 2:
dataset data context acknowledgements number team content time contains inspiration
Topic 3:
data dataset content player number game information time contains file
Topic 4:
data dataset contains information number available content set acknowledgements database
Topic 5:
race horse time nominal data taken kumar marathon max zillow
Topic 6:
data dataset time content acknowledgements context information inspiration state health
Topic 7:
data university dataset number information content context use contains acknowledgements
Topic 8:
data dataset year country content world total years information context
Topic 9:
dataset data content contains context text price acknowledgements inspiration information


In [10]:
import gensim
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel, TfidfModel
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd

# Tokenizer that emits runs of word characters, and a WordNet lemmatizer;
# both are used by the preprocessing step below.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words plus a few extra entries.
# NOTE(review): 'additional', 'stopword1' and 'stopword2' look like template
# placeholders rather than deliberately chosen words - confirm or replace.
stop_words = set(stopwords.words('english')).union(
    {'additional', 'stopword1', 'stopword2'}
)

# Descriptions as a plain Python list, with any missing value replaced by a
# placeholder string.
documents = data['Description'].fillna('No description').tolist()

def preprocess(doc):
    """Turn one document into a list of lemmatized, non-stop-word tokens.

    The document is lower-cased and split with the module-level ``tokenizer``;
    tokens found in the module-level ``stop_words`` set are dropped, and each
    remaining token is passed through ``lemmatizer.lemmatize``. The stop-word
    check is made on the token before it is lemmatized.

    Args:
        doc: The document text.

    Returns:
        List of lemmatized tokens in their original order.
    """
    lemmas = []
    for token in tokenizer.tokenize(doc.lower()):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Token lists, one per document.
texts = list(map(preprocess, documents))

# Vocabulary over all documents, pruned with the thresholds no_below=20 and
# no_above=0.3 (see gensim's Dictionary.filter_extremes for their meaning).
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=20, no_above=0.3)

# Bag-of-words vector for each document under the pruned vocabulary.
corpus = list(map(dictionary.doc2bow, texts))

# Re-weight the bag-of-words vectors with a TF-IDF model fitted on them.
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Train a 10-topic gensim LDA model on the TF-IDF weighted corpus.
#
# random_state is fixed so that the topics printed below (and any coherence
# score computed from this model) are reproducible across
# "Restart Kernel -> Run All"; without it every run yields different topics.
# The value 0 matches the seed used for the scikit-learn LDA model earlier in
# the notebook.
# NOTE(review): LDA is usually fitted on raw bag-of-words counts (`corpus`)
# rather than TF-IDF weights - confirm that `corpus_tfidf` is intentional.
lda_model = LdaModel(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=20,
    alpha='auto',
    eta='auto',
    random_state=0,
)

# print_topics(-1) returns every topic as an (index, formatted-words) pair.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

# Score the gensim model with the 'c_v' coherence measure, computed from the
# tokenised texts and the pruned dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

Topic: 0 
Words: 0.025*"train" + 0.017*"mile" + 0.015*"speech" + 0.015*"coordinate" + 0.014*"running" + 0.014*"twitter" + 0.013*"speaker" + 0.013*"removed" + 0.010*"taking" + 0.009*"duplicate"
Topic: 1 
Words: 0.367*"yet" + 0.239*"description" + 0.014*"pretrained" + 0.011*"model" + 0.008*"trained" + 0.008*"cell" + 0.007*"architecture" + 0.006*"transferable" + 0.005*"feature" + 0.005*"depth"
Topic: 2 
Words: 0.024*"borough" + 0.019*"york" + 0.018*"weather" + 0.015*"text" + 0.012*"city" + 0.010*"new" + 0.009*"nyc" + 0.009*"progress" + 0.009*"goal" + 0.009*"book"
Topic: 3 
Words: 0.055*"song" + 0.030*"music" + 0.026*"audio" + 0.021*"genre" + 0.020*"closed" + 0.017*"max" + 0.013*"surface" + 0.012*"care" + 0.012*"adding" + 0.011*"relative"
Topic: 4 
Words: 0.025*"nationality" + 0.020*"birth" + 0.019*"abbreviation" + 0.019*"improving" + 0.018*"word" + 0.016*"specie" + 0.014*"facility" + 0.012*"gas" + 0.010*"california" + 0.009*"key"
Topic: 5 
Words: 0.027*"customer" + 0.024*"candidate" + 0.0