# TBA 3102 - Text Analytics
## Practical Lab 09 - Text Summarization and Topic Models (II)
### Question 1 - Topic Modeling
Student: Nicky Ng <br>
GitHub User: [ahjimomo](https://github.com/ahjimomo) <br>
Student Number: A0194330L

### Libraries

In [1]:
# Data Wrangling
import numpy as np
import pandas as pd

# Topic Modeling
import nltk
import gensim

# Tokenizer & Feature Engineering
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
stop_words = nltk.corpus.stopwords.words('english')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

# Gemsim models
from gensim import corpora, models
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel

# Parameters Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid

# Display DF
from IPython.core.display import HTML
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('max_colwidth', None)
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Import dataset

In [2]:
# Import cleaned dataset
raw_df = pd.read_csv('./data/voted-kaggled-dataset-cleaned.csv')
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2143 entries, 0 to 2142
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           2143 non-null   int64 
 1   Title                2143 non-null   object
 2   Subtitle             2039 non-null   object
 3   Owner                2143 non-null   object
 4   Votes                2143 non-null   int64 
 5   Versions             2143 non-null   object
 6   Tags                 1603 non-null   object
 7   Data Type            2143 non-null   object
 8   Size                 2143 non-null   object
 9   License              2143 non-null   object
 10  Views                2143 non-null   object
 11  Download             2133 non-null   object
 12  Kernels              1205 non-null   object
 13  Topics               1557 non-null   object
 14  URL                  2143 non-null   object
 15  Description          2143 non-null   object
 16  Cleane

### Feature Engineering to prepare features

In [3]:
def normalize_corpus(descriptions):
    
    norm_description = []
    
    for description in descriptions:
        
        desc_tokens = [token.strip() for token in wtk.tokenize(description)]    
        #desc_tokens = [wnl.lemmatize(token) for token in desc_tokens if not token.isnumeric()]
        desc_tokens = [token for token in desc_tokens if len(token) > 1]
        desc_tokens = [token for token in desc_tokens if token not in stop_words]
        desc_tokens = list(filter(None, desc_tokens))
        
        norm_description.append(desc_tokens)
    
    return norm_description

In [4]:
processed_desc = list(raw_df['Cleaned_Description'])
norm_desc = normalize_corpus(processed_desc)
raw_df['normalized_description'] = norm_desc

In [5]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2143 entries, 0 to 2142
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              2143 non-null   int64 
 1   Title                   2143 non-null   object
 2   Subtitle                2039 non-null   object
 3   Owner                   2143 non-null   object
 4   Votes                   2143 non-null   int64 
 5   Versions                2143 non-null   object
 6   Tags                    1603 non-null   object
 7   Data Type               2143 non-null   object
 8   Size                    2143 non-null   object
 9   License                 2143 non-null   object
 10  Views                   2143 non-null   object
 11  Download                2133 non-null   object
 12  Kernels                 1205 non-null   object
 13  Topics                  1557 non-null   object
 14  URL                     2143 non-null   object
 15  Desc

In [6]:
# Feature Cleaning
#bigram = gensim.models.Phrases(raw_df['normalized_description'], min_count = 20, threshold = 20, delimiter  = '_')
#bigram_model = gensim.models.phrases.Phraser(bigram)

# Creating both unigram & bigram
#norm_corpus = []
#for doc in raw_df['normalized_description']:
#    bigram_doc = bigram_model[doc]
#    norm_corpus.append(bigram_doc)
#print(bigram_doc)

# Create a dictionary representation of the document of unigram
dictionary = gensim.corpora.Dictionary(norm_desc)

# Filter out words that occur less than 20 documents, or more than 60% of the documents.
dictionary.filter_extremes(no_below = 20, no_above = 0.6)

# Transforming corpus into bag of words vectors
bow_corpus = [dictionary.doc2bow(text) for text in norm_desc]

### fine-tuning & selecting optimal algorithm and model
* Latent Semantic Indexing: [LSI Parameters] (https://radimrehurek.com/gensim/models/lsimodel.html)
* Latent Dirichlet Allocation: [LDA parameters] (https://radimrehurek.com/gensim/models/ldamodel.html)

In [7]:
def finetuning_model(corpus, dictionary, algor, df):
    # Initialize dataframe and list to store results
    results = pd.DataFrame()
    coherence_cv_scores = []
    coherence_UMass_scores = []
    parameters = []
    algor_lst = []
    
    # Parameters
    topic_lst = [5, 6, 7, 8, 9, 10]
    random_seed = [42]
    
    # Compute hyperparameter grid
    if algor == 'LSI':
        hyperparams = {'num_topics': topic_lst, 'random_seed': random_seed, 'power_iters': [10, 50, 100]}
    elif algor == 'LDA':
        hyperparams = {'num_topics': topic_lst, 'random_state': random_seed, 'alpha': ['symmetric', 'auto'], 'passes': [1, 10], 'iterations': [50, 200, 500]}
    paramgrid = list(ParameterGrid(hyperparams))
    
    # Loop over parameter grid for LSI
    count = 0
    for params in paramgrid:
        if algor == 'LSI':
            model = LsiModel(corpus = corpus, id2word = dictionary, **params)
        elif algor == 'LDA':
            model = LdaModel(corpus = corpus, id2word = dictionary, **params)
        
        # Compute coherence score
        umass_model = CoherenceModel(model = model, corpus = corpus, dictionary = dictionary, coherence = 'u_mass')
        umass_score = umass_model.get_coherence()
        cv_model = CoherenceModel(model = model, texts = df['normalized_description'], dictionary = dictionary, coherence = 'c_v')
        cv_score = cv_model.get_coherence()
        
        # Store the results
        algor_lst.append(f'{algor}_{count}')
        parameters.append(params)
        coherence_cv_scores.append(cv_score)
        coherence_UMass_scores.append(umass_score)
        
        count += 1
        
    # Append result to result dataframe and return dataframe
    results['algorithm'] = algor_lst
    results['cv_score'] = coherence_cv_scores
    results['umass_score'] = coherence_UMass_scores
    results['parameters'] = parameters
    
    return results

In [8]:
# Perform Topic Modeling with LSI
LSI_results = finetuning_model(bow_corpus, dictionary, 'LSI', raw_df)
LSI_results

Unnamed: 0,algorithm,cv_score,umass_score,parameters
0,LSI_0,0.414374,-1.847259,"{'num_topics': 5, 'power_iters': 10, 'random_seed': 42}"
1,LSI_1,0.414374,-1.847259,"{'num_topics': 5, 'power_iters': 50, 'random_seed': 42}"
2,LSI_2,0.43057,-1.978477,"{'num_topics': 5, 'power_iters': 100, 'random_seed': 42}"
3,LSI_3,0.425324,-1.916569,"{'num_topics': 6, 'power_iters': 10, 'random_seed': 42}"
4,LSI_4,0.481299,-2.086907,"{'num_topics': 6, 'power_iters': 50, 'random_seed': 42}"
5,LSI_5,0.383058,-1.793504,"{'num_topics': 6, 'power_iters': 100, 'random_seed': 42}"
6,LSI_6,0.383131,-2.045915,"{'num_topics': 7, 'power_iters': 10, 'random_seed': 42}"
7,LSI_7,0.383131,-2.045915,"{'num_topics': 7, 'power_iters': 50, 'random_seed': 42}"
8,LSI_8,0.412386,-1.952091,"{'num_topics': 7, 'power_iters': 100, 'random_seed': 42}"
9,LSI_9,0.450456,-4.127868,"{'num_topics': 8, 'power_iters': 10, 'random_seed': 42}"


In [9]:
# Perform Topic Modeling with LDA
LDA_results = finetuning_model(bow_corpus, dictionary, 'LDA', raw_df)
LDA_results

Unnamed: 0,algorithm,cv_score,umass_score,parameters
0,LDA_0,0.393777,-1.706755,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 5, 'passes': 1, 'random_state': 42}"
1,LDA_1,0.393518,-1.895762,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 5, 'passes': 10, 'random_state': 42}"
2,LDA_2,0.386449,-1.737513,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 6, 'passes': 1, 'random_state': 42}"
3,LDA_3,0.395409,-1.911878,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 6, 'passes': 10, 'random_state': 42}"
4,LDA_4,0.373664,-1.770506,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 7, 'passes': 1, 'random_state': 42}"
5,LDA_5,0.412372,-1.866418,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 7, 'passes': 10, 'random_state': 42}"
6,LDA_6,0.382214,-1.874991,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 8, 'passes': 1, 'random_state': 42}"
7,LDA_7,0.388025,-1.984246,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 8, 'passes': 10, 'random_state': 42}"
8,LDA_8,0.376307,-1.808159,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 9, 'passes': 1, 'random_state': 42}"
9,LDA_9,0.383854,-2.218714,"{'alpha': 'symmetric', 'iterations': 50, 'num_topics': 9, 'passes': 10, 'random_state': 42}"


In [10]:
# Concat results together
full_results = pd.concat([LSI_results, LDA_results])

In [11]:
# Show results
full_results

Unnamed: 0,algorithm,cv_score,umass_score,parameters
0,LSI_0,0.414374,-1.847259,"{'num_topics': 5, 'power_iters': 10, 'random_seed': 42}"
1,LSI_1,0.414374,-1.847259,"{'num_topics': 5, 'power_iters': 50, 'random_seed': 42}"
2,LSI_2,0.43057,-1.978477,"{'num_topics': 5, 'power_iters': 100, 'random_seed': 42}"
3,LSI_3,0.425324,-1.916569,"{'num_topics': 6, 'power_iters': 10, 'random_seed': 42}"
4,LSI_4,0.481299,-2.086907,"{'num_topics': 6, 'power_iters': 50, 'random_seed': 42}"
5,LSI_5,0.383058,-1.793504,"{'num_topics': 6, 'power_iters': 100, 'random_seed': 42}"
6,LSI_6,0.383131,-2.045915,"{'num_topics': 7, 'power_iters': 10, 'random_seed': 42}"
7,LSI_7,0.383131,-2.045915,"{'num_topics': 7, 'power_iters': 50, 'random_seed': 42}"
8,LSI_8,0.412386,-1.952091,"{'num_topics': 7, 'power_iters': 100, 'random_seed': 42}"
9,LSI_9,0.450456,-4.127868,"{'num_topics': 8, 'power_iters': 10, 'random_seed': 42}"


In [12]:
# Rearrange results
results = full_results.sort_values('umass_score', ascending = True).sort_values('cv_score', ascending = False)
results

Unnamed: 0,algorithm,cv_score,umass_score,parameters
11,LSI_11,0.514781,-5.370141,"{'num_topics': 8, 'power_iters': 100, 'random_seed': 42}"
4,LSI_4,0.481299,-2.086907,"{'num_topics': 6, 'power_iters': 50, 'random_seed': 42}"
10,LSI_10,0.47578,-4.146934,"{'num_topics': 8, 'power_iters': 50, 'random_seed': 42}"
9,LSI_9,0.450456,-4.127868,"{'num_topics': 8, 'power_iters': 10, 'random_seed': 42}"
16,LSI_16,0.448502,-3.874545,"{'num_topics': 10, 'power_iters': 50, 'random_seed': 42}"
2,LSI_2,0.43057,-1.978477,"{'num_topics': 5, 'power_iters': 100, 'random_seed': 42}"
13,LSI_13,0.428188,-4.118091,"{'num_topics': 9, 'power_iters': 50, 'random_seed': 42}"
3,LSI_3,0.425324,-1.916569,"{'num_topics': 6, 'power_iters': 10, 'random_seed': 42}"
15,LSI_15,0.415627,-3.660034,"{'num_topics': 10, 'power_iters': 10, 'random_seed': 42}"
12,LSI_12,0.415557,-4.118701,"{'num_topics': 9, 'power_iters': 10, 'random_seed': 42}"


Using perplexity and coherence scores as measures to evaluate the topic model, the model would be better if
- Lower the UMass score
- Higher the Cv score

In [13]:
results.iloc[0]

algorithm                                                        LSI_11
cv_score                                                       0.514781
umass_score                                                   -5.370141
parameters     {'num_topics': 8, 'power_iters': 100, 'random_seed': 42}
Name: 11, dtype: object

In [14]:
final_df = raw_df.copy()

In [15]:
# 4c. Determine the most dominant topic for each document using the best model
best_model = LsiModel(corpus = bow_corpus, id2word = dictionary,
                      power_iters = 100, num_topics = 8,
                      random_seed = 42)

# Compute dominant topics for each document
topic_weights = []
for row in best_model[bow_corpus]:
    topic_weights.append(dict(row))
    
topic_weights_df = pd.DataFrame(topic_weights)

# List to store topic and keywords
dominant_topics = []
topic_keywords = []

for i, row in topic_weights_df.iterrows():
    sorted_topics = sorted(row.items(), key = lambda x: x[1], reverse = True)
    top_topic = f"Topic {sorted_topics[0][0]} ({sorted_topics[0][1]:.3f})"
    dominant_topics.append(top_topic)
    
    # Get top keywords for each topic
    keywords = [word for (word, prob) in best_model.show_topic(sorted_topics[0][0], topn=10)]
    topic_keywords.append(keywords)
    
# Append topic back to dataframe
final_df['Dominant_Topic'] = dominant_topics
final_df['Topic_Keywords'] = topic_keywords

In [16]:
final_df.to_csv('./data/corpus_topic_best.csv')

In [17]:
final_df[final_df['Dominant_Topic'].str.contains('Topic 0')][['Cleaned_Description', 'Dominant_Topic', 'Topic_Keywords']].head()

Unnamed: 0,Cleaned_Description,Dominant_Topic,Topic_Keywords
1140,stick overflow dataset,Topic 0 (nan),"[university, state, college, use, player, institute, number, contain, time, file]"
1515,loan datum,Topic 0 (nan),"[university, state, college, use, player, institute, number, contain, time, file]"
1925,moist datum htpneuralnetworksandeplearningcom,Topic 0 (nan),"[university, state, college, use, player, institute, number, contain, time, file]"


In [19]:
final_df[final_df['Dominant_Topic'].str.contains('Topic 1')][['Cleaned_Description', 'Dominant_Topic', 'Topic_Keywords']].head()

Unnamed: 0,Cleaned_Description,Dominant_Topic,Topic_Keywords
1811,context conical metadata package idptb namepen trepak copyrightcopyright c university pennsylvania licensethis sub full penn trepak corpus version,Topic 1 (0.134),"[player, use, university, number, time, file, contain, year, team, name]"
1866,context conical metadata package idparadigm nameparadigm corpus authorcathy bow university melbourne licensedistribute permission author,Topic 1 (0.171),"[player, use, university, number, time, file, contain, year, team, name]"
2064,context datum grab news htpswusnewscom follow datum point include datum set bank aceptancerate active stag photo cost financial aid city surname zip percent receive aid state average high school gap tie bank publicprivate university business reputation score suction engineering reputation score enrollment size region include statistic surround follow university princeton university harvard university university chicago yale university columbia university massachusetts institute technology stanford university university pennsylvania duke university california institute technology dartmouth college john hopkin university northwestern university brown university cornell university rice university vanderbilt university university notre dame washington university st louis georgetown university memory university university californiaberkeley university californian angeles university southern california carnegie melon university university virginia wake forest university university michigan harbor tufts university new york university university north carolinachapel hill boston college college william mary brand university georgia institute technology university rochester boston university case western reserve university university californiasanta barbara northwestern university lane university rensselaer polytechnic institute university californiairvine university californian diego university florida high university peperdine university university californiadavis university mimi university wisconsinmadison vilanova university pennsylvania state universityuniversity park university ilinoisurbanachampaign ohio state universitycolumbus university georgia george washington university pursue universityw lafayette university connecticut university texasaustin university washington brigham young universityprovo fordham university southern method university syracuse university university marylandcolege park worcester polytechnic institute crimson university university pittsburgh american university ruler universitynew brunswick stevens institute technology texas universitycolege station university minesotatwin city virginia teach taylor university colorado school mine university masachusetsamherst mimi universityoxford texas christian university university iowa clark university florida state university michigan state university north carolina state universityraleigh university californiasanta cruz university delaware binghamton universitysuny university denver university tula indiana universityblomington marquette university university coloradoboulder university san diego repel university saint louis university yeshiva university rochester institute technology stony brook universitysuny sun college environmental science forestry university bufalosuny university oklahoma university vermont autumn university illinois institute technology loyal university chicago university new hampshire university oregon university south carolina university tennessee howard university university alabama university san francisco university pacific university utah arizona state universitytempe iowa state university temple university university kansas university st thomas catholic university america deal university duquesne university university missouri clarion university colorado state university michigan technological university seton hall university university arizona university californiariverside university dayton university nebraskalincoln hofstra university louisiana state universitybaton rough merger university new school ruler universitynewark university arkansas university cincinnati university kentucky george mason university new jersey institute technology san diego state university university south florida washington state university kansas state university oregon state university st john fish college university ilinoischicago university mississippi university texasdala delhi university florida institute technology ohio university seattle pacific university university albanysuny oklahoma state university university masachusetslowel university rhode island bill university illinois state university university alabamabirmingham university hawaimanoa university la verse university marylandbaltimore county immaculate university maryvile university st louis missouri university science technology st john university university californiamerce university louisville mississippi state university roman university university central florida university idaho virginia commonwealth university kent state university robert morris university texas teach university union university university hartford edge college less university lipscomb university folk university university maine university wyoming abuse pacific university ball state university montclair state university pace university west virginia university andrew university indiana universitypurdue universityindianapolis university houston university new mexico university north dakota widen university new mexico state university north dakota state university nova southwestern university university north carolinacharlote bow green state university california state universityfulerton dallas baptist university university masachusetsboston university nevadareno central michigan university east carolina university florida university montana state university university alaskafairbank university coloradodenver university masachusetsdartmouth university montana western michigan university florida international university louisiana teach university south dakota state university southern illinois universitycarbondale university alabamahuntsvile university misourikansa city utah state university island university benediction university california state universityfresno gardnerweb university georgia state university shenandoah university university south dakota wayne state university american international college augusta university mary university noise state university cardinal stretch university clark atlanta university cleveland state university eastern michigan university east tennessee state university florida atlantic university georgia southern university grand canyon university indiana state university indiana university pennsylvania jackson state university kenesaw state university later university liberty university lindenwod university middle tennessee state university morgan state university national louis university north carolina state university northern arizona university northern illinois university oakland university old dominion university poland state university prairie view university regent university sam houston state university san francisco state university spaulde university tennessee state university tennessee technological university texas universitycomerce texas universitycorpus christ texas universitykingsvile texas southern university texas state university texas woman university treveca nazarene university trinity international university university akron university arkansaslitle rock university louisianalafayete university louisianamonroe university marylandeastern shore university memphis university misourist louis university nebraskaomaha university nevadala begin university new orleans university north carolinagrensboro university northern colorado university north texas university south alabama university southern mississippi university texasarlington university texas past university texasrio grande valley university texas antonio university cumberland university tell university west florida university west georgia university wisconsinmilwauke valdosta state university wichita state university wright state university valiant international university arise university california institute integral study patella university idaho state university northcentral university strident university international union institute university university phoenix walk university wilmington university,Topic 1 (56.209),"[player, use, university, number, time, file, contain, year, team, name]"
2083,dataset contain college common datum set contain common datum set follow school alabama state university angelo state university arapahoe community college arkansas teach university aroma university baldwin palace university college bemidji state university college binghamton university boston university buckle university cabin university california baptist university california state university bakersfield california state university long beach california state university los angeles california state university sacramento carnegie melon university case western reserve university christopher newport university clark university coley college college charleston coin college colorado college colorado school mine colorado state universitypueblo columbia college concord university texas cornell university davidson college delaware technical community college dessalle university dickinson college drake university draw university duquesne university east central university eastern washington university embryo riddle aeronautical universitydaytona beach fairfield university florida gulf coast florida international university fort hay state university georgia institute technology gettysburg college hamilton college holy university humboldt state university iowa state university jackson state university john jay college criminal justice kenesaw state university lafayette college lane college lee university le one college senior rhine university life university loyal university maryland lubbock christian university come college len university common datum set alone university marlboro college maryvile university massachusetts maritime academy metropolitan state university denver michigan technological university midlebury college milersvile university mississippi state university mott community college neumann university northwestern state university northern arizona university northern kentucky university back college oklahoma christian university oklahoma state university old dominion university oral roberts university peperdine university corona college prescott college providence college reed college leg university rensselaer polytechnic institute rice university rochester college ruler university saint vincent college san francisco state university santa clara university strip college seton hill university sewene shipensburg university simpson university slippery rock university smith college sonya state university southwestern community college southwestern oklahoma state university southwestern oklahoma state university springfield college st arise university stanford university stephen f austin state university sun oneonta sun potsdam sweet briar college taylor university temple university tennessee wesleyan university texas university kingsvile texas university texas wesleyan university college brockport college new jersey university stanton university southern mississippi university tennessee trinity university tufts university lane university tula community college university buffalo university enrollment university california davis university california riverside university colorado bolder university delaware university kentucky university louisville university maine university missouri university montana university mount olive university nebraska kearney university nebraskalincoln university nevada rent university new hampshire university new mexico university north alabama university north carolina charlotte university pennsylvania university pikevile university puget sound university science art university texas rio university science philadelphia university wisconsin university wide common datum set vilanova university virginia commonwealth university washburn university washington lee university washington college state university wellesley college wesleyan university westfield state university westminster college wheat college whitman college widen university worcester polytechnic institute easy university louisiana easy university yale university,Topic 1 (24.906),"[player, use, university, number, time, file, contain, year, team, name]"


In [33]:
final_df[final_df['Dominant_Topic'].str.contains('Topic 2')][['Description', 'Dominant_Topic', 'Topic_Keywords']].sample(5)

Unnamed: 0,Description,Dominant_Topic,Topic_Keywords
74,"Data on shots taken during the 2014-2015 season, who took the shot, where on the floor was the shot taken from, who was the nearest defender, how far away was the nearest defender, time on the shot clock, and much more. The column titles are generally self-explanatory.\nUseful for evaluating who the best shooter is, who the best defender is, the hot-hand hypothesis, etc.\nScraped from NBA's REST API.",Topic 2 (0.915),"[player, team, shoot, goal, face, ice, attempt, use, weight, take]"
2056,"Context\nThis dataset mainly features the score changes during badminton games in the rally-point system.\nContent\nThe dataset contains 11872 games from 5131 matches in BWF Super Series Tournaments. There are 6 fields:\n- Year: I collected 3 years data: 2015-2017.\n- Tournament: For each year, there are 12 Super Series Tournaments.\n- Round: 1 - Round 1; 2 - Round 2; Q - Quarter Finals; S - Semi-Finals; F - Finals\n- Match: Information about the countries of the players.\n- Type: MS - Men's Single; WS - Women's Single; MD - Men's Double; WD - Women's Double; XD - Mixed Double\n- Scores: Score changes during the games.\nAcknowledgements\nThe dataset was collected from bwfbadminton.com. I wrote codes to scrap the information.\nInspiration\nPerformance of the players is reflected on how the score changes during the games. Exploring this information may help us to predict or learn something related to badminton games.",Topic 2 (0.811),"[player, team, shoot, goal, face, ice, attempt, use, weight, take]"
1778,"Context\nAs many others I have asked myself if it is possible to use machine learning in order to create valid predictions for football (soccer) match outcomes. Hence I created a dataset consisting of historic match data for the German Bundesliga (1st and 2nd Division) as well as the English Premier League reaching back as far as 1993 up to 2016. Besides the mere information concerning goals scored and home/draw/away win the dataset also includes per site (team) data such as transfer value per team (pre-season), the squad strength, etc. Unfortunately I was only able to find sources for these advanced attributes going back to the 2005 season.\nI have used this dataset with different machine learning algorithms including random forests, XGBoost as well as different recurrent neural network architectures (in order to potentially identify recurring patterns in winning streaks, etc.). I'd like to share the approaches I used as separate Kernels here as well. So far I did not manage to exceed an accuracy of 53% consistently on a validation set using 2016 season of Bundesliga 1 (no information rate = 49%).\nAlthough I have done some visual exploration before implementing the different machine learning approaches using Tableau, I think a visual exploration kernel would be very beneficial.\nContent\nThe data comes as an Sqlite file containing the following tables and fields:\nTable: Matches\nMatch_ID (int): unique ID per match\nDiv (str): identifies the division the match was played in (D1 = Bundesliga, D2 = Bundesliga 2, E0 = English Premier League)\nSeason (int): Season the match took place in (usually covering the period of August till May of the following year)\nDate (str): Date of the match\nHomeTeam (str): Name of the home team\nAwayTeam (str): Name of the away team\nFTHG (int) (Full Time Home Goals): Number of goals scored by the home team\nFTAG (int) (Full Time Away Goals): Number of goals scored by the away team\nFTR (str) (Full Time Result): 3-way result of the match (H = Home Win, D = Draw, A = Away Win)\nTable: Teams\nSeason (str): Football season for which the data is valid\nTeamName (str): Name of the team the data concerns\nKaderHome (str): Number of Players in the squad\nAvgAgeHome (str): Average age of players\nForeignPlayersHome (str): Number of foreign players (non-German, non-English respectively) playing for the team\nOverallMarketValueHome (str): Overall market value of the team pre-season in EUR (based on data from transfermarkt.de)\nAvgMarketValueHome (str): Average market value (per player) of the team pre-season in EUR (based on data from transfermarkt.de)\nStadiumCapacity (str): Maximum stadium capacity of the team's home stadium\nTable: Unique Teams\nTeamName (str): Name of a team\nUnique_Team_ID (int): Unique identifier for each team\nTable: Teams_in_Matches\nMatch_ID (int): Unique match ID\nUnique_Team_ID (int): Unique team ID (This table is used to easily retrieve each match a given team has played in)\nBased on these tables I created a couple of views which I used as input for my machine learning models:\nView: FlatView\nCombination of all matches with the respective additional data from Teams table for both home and away team.\nView: FlatView_Advanced\nSame as Flatview but also includes Unique_Team_ID and Unique_Team in order to easily retrieve all matches played by a team in chronological order.\nView: FlatView_Chrono_TeamOrder_Reduced\nSimilar to Flatview_Advanced, however missing the additional attributes from team in order to have a longer history including years 1993 - 2004. Especially interesting if one is only interested in analyzing winning/loosing streaks.\nAcknowledgements\nThanks to football-data.co.uk and transfermarkt.de for providing the raw data used in this dataset.\nInspiration\nPlease feel free to use the humble dataset provided here for any purpose you want. To me it would be most interesting if others think that recurrent neural networks could in fact be of help (and even maybe outperform classical feature engineering) in identifying streaks of losses and wins. In the literature I mostly only found example of RNN application where the data were time series in a very narrow sense (e.g. temperature measurements over time) hence it would be interesting to get your input on this question.\nMaybe someone also finds additional attributes per team or match which have substantial impact on match outcome. So far I have found the ""Market Value"" of a team to be by far the best predictor when two teams face each other, which makes sense as the market value usually tends to correlate closely with the strength of a team and it's propects at winning",Topic 2 (4.392),"[player, team, shoot, goal, face, ice, attempt, use, weight, take]"
449,"Context\nThis data set can be paired with the shot logs data set from the same season.\nContent\nFull players stats from the 2014-2015 season + personal details such as height. weight, etc.\nThe data was scraped and copied from: http://www.basketball-reference.com/teams/ and http://stats.nba.com/leaders#!?Season=2014-15&SeasonType=Regular%20Season&StatCategory=MIN&CF=MIN*G*2&PerMode=Totals",Topic 2 (0.905),"[player, team, shoot, goal, face, ice, attempt, use, weight, take]"
626,"Context & Content\nThis dataset features the salaries of 874 nhl players for the 2016/2017 season. I have randomly split the players into a training (612 players) and test (262 players) populations. There are 151 predictor columns (described in column legend section, if you're not familiar with hockey the meaning of some of these may be a bit cryptic!) as well as a leading column with the players 2016/2017 annual salary. For the test population the actual salaries have been broken off into a separate .csv file.\nAcknowledgements\nRaw excel sheet was acquired http://www.hockeyabstract.com/\nInspiration\nCan you build a model to predict NHL player's salaries? What are the best predictors of how much a player will make?\nColumn Legend\nAcronym - Meaning\n%FOT - Percentage of all on-ice faceoffs taken by this player.\n+/- - Plus/minus\n1G - First goals of a game\nA/60 - Events Against per 60 minutes, defaults to Corsi, but can be set to another stat\nA1 - First assists, primary assists\nA2 - Second assists, secondary assists\nBLK% - Percentage of all opposing shot attempts blocked by this player\nBorn - Birth date\nC.Close - A player shot attempt (Corsi) differential when the game was close\nC.Down - A player shot attempt (Corsi) differential when the team was trailing\nC.Tied - A player shot attempt (Corsi) differential when the team was tied\nC.Up - A player shot attempt (Corsi) differential when the team was in the lead\nCA - Shot attempts allowed (Corsi, SAT) while this player was on the ice\nCap Hit - The player's cap hit\nCBar - Crossbars hit\nCF - The team's shot attempts (Corsi, SAT) while this player was on the ice\nCF.QoC - A weighted average of the Corsi percentage of a player's opponents\nCF.QoT - A weighted average of the Corsi percentage of a player's linemates\nCHIP - Cap Hit of Injured Player is games lost to injury multiplied by cap hit per game\nCity - City of birth\nCntry - Country of birth\nDAP - Disciplined aggression proxy, which is hits and takeaways divided by minor penalties\nDFA - Dangerous Fenwick against, which is on-ice unblocked shot attempts weighted by shot quality\nDFF - Dangerous Fenwick for, which is on-ice unblocked shot attempts weighted by shot quality\nDFF.QoC - Quality of Competition metric based on Dangerous Fenwick, which is unblocked shot attempts weighted for shot quality\nDftRd - Round in which the player was drafted\nDftYr - Year drafted\nDiff - Events for minus event against, defaults to Corsi, but can be set to another stat\nDiff/60 - Events for minus event against, per 60 minutes, defaults to Corsi, but can be set to another stat\nDPS - Defensive point shares, a catch-all stats that measures a player's defensive contributions in points in the standings\nDSA - Dangerous shots allowed while this player was on the ice, which is rebounds plus rush shots\nDSF - The team's dangerous shots while this player was on the ice, which is rebounds plus rush shots\nDZF - Shifts this player has ended with an defensive zone faceoff\ndzFOL - Faceoffs lost in the defensive zone\ndzFOW - Faceoffs win in the defensive zone\ndzGAPF - Team goals allowed after faceoffs taken in the defensive zone\ndzGFPF - Team goals scored after faceoffs taken in the defensive zone\nDZS - Shifts this player has started with an defensive zone faceoff\ndzSAPF - Team shot attempts allowed after faceoffs taken in the defensive zone\ndzSFPF - Team shot attempts taken after faceoffs taken in the defensive zone\nE+/- - A player's expected +/-, based on his team and minutes played\nENG - Empty-net goals\nExp dzNGPF - Expected goal differential after faceoffs taken in the defensive zone, based on the number of them\nExp dzNSPF - Expected shot differential after faceoffs taken in the defensive zone, based on the number of them\nExp ozNGPF - Expected goal differential after faceoffs taken in the offensive zone, based on the number of them\nExp ozNSPF - Expected shot differential after faceoffs taken in the offensive zone, based on the number of them\nF.Close - A player unblocked shot attempt (Fenwick) differential when the game was close\nF.Down - A player unblocked shot attempt (Fenwick) differential when the team was trailing\nF.Tied - A player unblocked shot attempt (Fenwick) differential when the team was tied\nF.Up - A player unblocked shot attempt (Fenwick) differential when the team was in the lead. Not the best acronym.\nF/60 - Events For per 60 minutes, defaults to Corsi, but can be set to another stat\nFA - Unblocked shot attempts allowed (Fenwick, USAT) while this player was on the ice\nFF - The team's unblocked shot attempts (Fenwick, USAT) while this player was on the ice\nFirst Name -\nFO% - Faceoff winning percentage\nFO%vsL - Faceoff winning percentage against lefthanded opponents\nFO%vsR - Faceoff winning percentage against righthanded opponents\nFOL - The team's faceoff losses while this player was on the ice\nFOL.Close - Faceoffs lost when the score was close\nFOL.Down - Faceoffs lost when the team was trailing\nFOL.Up - Faceoffs lost when the team was in the lead\nFovsL - Faceoffs taken against lefthanded opponents\nFovsR - Faceoffs taken against righthanded opponents\nFOW - The team's faceoff wins while this player was on the ice\nFOW.Close - Faceoffs won when the score was close\nFOW.Down - Faceoffs won when the team was trailing\nFOW.Up - Faceoffs won when the team was in the lead\nG - Goals\nG.Bkhd - Goals scored on the backhand\nG.Dflct - Goals scored with deflections\nG.Slap - Goals scored with slap shots\nG.Snap - Goals scored with snap shots\nG.Tip - Goals scored with tip shots\nG.Wrap - Goals scored with a wraparound\nG.Wrst - Goals scored with a wrist shot\nGA - Goals allowed while this player was on the ice\nGame - Game Misconduct penalties\nGF - The team's goals while this player was on the ice\nGP - Games Played\nGrit - Defined as hits, blocked shots, penalty minutes, and majors\nGS - The player's combined game score\nGS/G - The player's average game score\nGVA - The team's giveaways while this player was on the ice\nGWG - Game-winning goals\nGWG - Game-winning goals\nHA - The team's hits taken while this player was on the ice\nHand - Handedness\nHF - The team's hits thrown while this player was on the ice\nHopFO - Opening faceoffs taken at home\nHopFOW - Opening faceoffs won at home\nHt - Height\niBLK - Shots blocked by this individual\niCF - Shot attempts (Corsi, SAT) taken by this individual\niDS - Dangerous shots taken by this player, the sum of rebounds and shots off the rush\niFF - Unblocked shot attempts (Fenwick, USAT) taken by this individual\niFOL - Faceoff losses by this individual\niFOW - Faceoff wins by this individual\niGVA - Giveaways by this individual\niHA - Hits taken by this individual\niHDf - The difference in hits thrown by this individual minus those taken\niHF - Hits thrown by this individual\niMiss - Individual shots taken that missed the net.\nInjuries - List of types of injuries incurred, if any\niPEND - Penalties drawn by this individual\niPenDf - The difference in penalties drawn minus those taken\niPENT - Penalties taken by this individual\nIPP% - Individual points percentage, which is on-ice goals for which this player had the goal or an assist\niRB - Rebound shots taken by this individual\niRS - Shots off the rush taken by this individual\niSCF - All scoring chances taken by this individual\niSF - Shots on goal taken by this individual\niTKA - Takeaways by this individual\nixG - Expected goals (weighted shots) for this individual, which is shot attempts weighted by shot location\nLast Name -\nMaj - Major penalties taken\nMatch - Match penalties\nMGL - Games lost due to injury\nMin - Minor penalties taken\nMisc - Misconduct penalties\nNat - Nationality\nNGPF - Net Goals Post Faceoff. A differential of all goals within 10 seconds of a faceoff, relative to expectations set by the zone in which they took place\nNHLid - NHL player id useful when looking at the raw data in game files\nNMC - What kind of no-movement clause this player's contract has, if any\nNPD - Net Penalty Differential is the player's penalty differential relative to a player of the same position with the same ice time per manpower situation\nNSPF - Net Shots Post Faceoff. A differential of all shot attempts within 10 seconds of a faceoff, relative to expectations set by the zone in which they took place\nNZF - Shifts this player has ended with a neutral zone faceoff\nnzFOL - Faceoffs lost in the neutral zone\nnzFOW - Faceoffs won in the neutral zone\nnzGAPF - Team goals allowed after faceoffs taken in the neutral zone\nnzGFPF - Team goals scored after faceoffs taken in the neutral zone\nNZS - Shifts this player has started with a neutral zone faceoff\nnzSAPF - Team shot attempts allowed after faceoffs taken in the neutral zone\nnzSFPF - Team shot attempts taken after faceoffs taken in the neutral zone\nOCA - Shot attempts allowed (Corsi, SAT) while this player was not on the ice\nOCF - The team's shot attempts (Corsi, SAT) while this player was not on the ice\nODZS - Defensive zone faceoffs that occurred without this player on the ice\nOFA - Unblocked shot attempts allowed (Fenwick, USAT) while this player was not on the ice\nOFF - The team's unblocked shot attempts (Fenwick, USAT) while this player was not on the ice\nOGA - Goals allowed while this player was not on the ice\nOGF - The team's goals while this player was not on the ice\nONZS - Neutral zone faceoffs that occurred without this player on the ice\nOOZS - Offensive zone faceoffs that occurred without this player on the ice\nOpFO - Opening faceoffs taken\nOpFOW - Opening faceoffs won\nOppCA60 - A weighted average of the shot attempts (Corsi, SAT) the team allowed per 60 minutes of a player's opponents\nOppCF60 - A weighted average of the shot attempts (Corsi, SAT) the team generated per 60 minutes of a player's opponents\nOppFA60 - A weighted average of the unblocked shot attempts (Fenwick, USAT) the team allowed per 60 minutes of a player's opponents\nOppFF60 - A weighted average of the unblocked shot attempts (Fenwick, USAT) the team generated per 60 minutes of a player's opponents\nOppGA60 - A weighted average of the goals the team allowed per 60 minutes of a player's opponents\nOppGF60 - A weighted average of the goals the team scored per 60 minutes of a player's opponents\nOppSA60 - A weighted average of the shots on goal the team allowed per 60 minutes of a player's opponents\nOppSF60 - A weighted average of the shots on goal the team generated per 60 minutes of a player's opponents\nOPS - Offensive point shares, a catch-all stats that measures a player's offensive contributions in points in the standings\nOSA - Shots on goal allowed while this player was not on the ice\nOSCA - Scoring chances allowed while this player was not on the ice\nOSCF - The team's scoring chances while this player was not on the ice\nOSF - The team's shots on goal while this player was not on the ice\nOTF - Shifts this player started with an on-the-fly change\nOTG - Overtime goals\nOTOI - The amount of time this player was not on the ice.\nOver - Shots that went over the net\nOvrl - Where the player was drafted overall\nOxGA - Expected goals allowed (weighted shots) while this player was not on the ice, which is shot attempts weighted by location\nOxGF - The team's expected goals (weighted shots) while this player was not on the ice, which is shot attempts weighted by location\nOZF - Shifts this player has ended with an offensive zone faceoff\nozFO - Faceoffs taken in the offensive zone\nozFOL - Faceoffs lost in the offensive zone\nozFOW - Faceoffs won in the offensive zone\nozGAPF - Team goals allowed after faceoffs taken in the offensive zone\nozGFPF - Team goals scored after faceoffs taken in the offensive zone\nOZS - Shifts this player has started with an offensive zone faceoff\nozSAPF - Team shot attempts allowed after faceoffs taken in the offensive zone\nozSFPF - Team shot attempts taken after faceoffs taken in the offensive zone\nPace - The average game pace, as estimated by all shot attempts per 60 minutes\nPass - An estimate of the player's setup passes (passes that result in a shot attempt)\nPct% - Percentage of all events produced by this team, defaults to Corsi, but can be set to another stat\nPDO - The team's shooting and save percentages added together, times a thousand\nPEND - The team's penalties drawn while this player was on the ice\nPENT - The team's penalties taken while this player was on the ice\nPIM - Penalties in minutes\nPosition - Positions played. NHL source listed first, followed by those listed by any other source.\nPost - Times hit the post\nPr/St - Province or state of birth\nPS - Point shares, a catch-all stats that measures a player's contributions in points in the standings\nPSA - Penalty shot attempts\nPSG - Penalty shot goals\nPTS - Points. Goals plus all assists\nPTS/60 - Points per 60 minutes\nQRelCA60 - Shot attempts allowed per 60 minutes relative to how others did against the same competition\nQRelCF60 - Shot attempts per 60 minutes relative to how others did against the same competition\nQRelDFA60 - Weighted unblocked shot attempts (Dangeorus Fenwick) allowed per 60 minutes relative to how others did against the same competition\nQRelDFF60 - Weighted unblocked shot attempts (Dangeorus Fenwick) per 60 minutes relative to how others did against the same competition\nRBA - Rebounds allowed while this player was on the ice. Two very different sources.\nRBF - The team's rebounds while this player was on the ice. Two very different sources.\nRelA/60 - The player's A/60 relative to the team when he's not on the ice\nRelC/60 - Corsi differential per 60 minutes relative to his team\nRelC% - Corsi percentage relative to his team\nRelDf/60 - The player's Diff/60 relative to the team when he's not on the ice\nRelF/60 - The player's F/60 relative to the team when he's not on the ice\nRelF/60 - Fenwick differential per 60 minutes relative to his team\nRelF% - Fenwick percentage relative to his team\nRelPct% - The players Pct% relative to the team when he's not on the ice\nRelZS% - The player's zone start percentage when he's on the ice relative to when he's not.\nRopFO - Opening faceoffs taken at home\nRopFOW - Opening faceoffs won at home\nRSA - Shots off the rush allowed while this player was on the ice\nRSF - The team's shots off the rush while this player was on the ice\nS.Bkhd - Backhand shots\nS.Dflct - Deflections\nS.Slap - Slap shots\nS.Snap - Snap shots\nS.Tip - Tipped shots\nS.Wrap - Wraparound shots\nS.Wrst - Wrist shots\nSA - Shots on goal allowed while this player was on the ice\nSalary - The player's salary\nSCA - Scoring chances allowed while this player was on the ice\nSCF - The team's scoring chances while this player was on the ice\nsDist - The average shot distance of shots taken by this player\nSF - The team's shots on goal while this player was on the ice\nSH% - The team's (not individual's) shooting percentage when the player was on the ice\nSOG - Shootout Goals\nSOGDG - Game-deciding shootout goals\nSOS - Shootout Shots\nStatus - This player's free agency status\nSV% - The team's save percentage when the player was on the ice\nTeam -\nTKA - The team's takeaways while this player was on the ice\nTMCA60 - A weighted average of the shot attempts (Corsi, SAT) the team allowed per 60 minutes of a player's linemates\nTMCF60 - A weighted average of the shot attempts (Corsi, SAT) the team generated per 60 minutes of a player's linemates\nTMFA60 - A weighted average of the unblocked shot attempts (Fenwick, USAT) the team allowed per 60 minutes of a player's linemates\nTMFF60 - A weighted average of the unblocked shot attempts (Fenwick, USAT) the team generated per 60 minutes of a player's linemates\nTMGA60 - A weighted average of the goals the team allowed per 60 minutes of a player's linemates\nTMGF60 - A weighted average of the goals the team scored per 60 minutes of a player's linemates\nTMSA60 - A weighted average of the shots on goal the team allowed per 60 minutes of a player's linemates\nTMSF60 - A weighted average of the shots on goal the team generated per 60 minutes of a player's linemates\nTmxGF - A weighted average of a player's linemates of the expected goals the team scored\nTmxGA - A weighted average of a player's linemates of the expected goals the team allowed\nTMGA - A weighted average of a player's linemates of the goals the team scored\nTMGF - A weighted average of a player's linemates of the goals the team allowed\nTOI - Time on ice, in minutes, or in seconds (NHL)\nTOI.QoC - A weighted average of the TOI% of a player's opponents.\nTOI.QoT - A weighted average of the TOI% of a player's linemates.\nTOI/GP - Time on ice divided by games played\nTOI% - Percentage of all available ice time assigned to this player.\nWide - Shots that went wide of the net\nWt - Weight\nxGA - Expected goals allowed (weighted shots) while this player was on the ice, which is shot attempts weighted by location\nxGF - The team's expected goals (weighted shots) while this player was on the ice, which is shot attempts weighted by location\nxGF.QoC - A weighted average of the expected goal percentage of a player's opponents\nxGF.QoT - A weighted average of the expected goal percentage of a player's linemates\nZS% - Zone start percentage, the percentage of shifts started in the offensive zone, not counting neutral zone or on-the-fly changes",Topic 2 (178.449),"[player, team, shoot, goal, face, ice, attempt, use, weight, take]"


In [66]:
final_df[final_df['Dominant_Topic'].str.contains('Topic 3')][['Cleaned_Description', 'Dominant_Topic', 'Topic_Keywords']].sample(5)

Unnamed: 0,Cleaned_Description,Dominant_Topic,Topic_Keywords
40,context vista category employmentbase nonimigrant vista temporary foreign worker united state foreign national apply isa employer must offer job submit petition isa immigration department also common isa status apply hold international student complete college high education begin work futile position follow article contain information isa process la file salary process time application process step step guide content dataset contain five year worth petition datum approximately million record overall column dataset include case status employer name worksite coordinate job title prevail wage occupation code year file information individual column refer column metadata detailed description underlie raw dataset available official datum dictionary acknowledgment office foreign labor mortification generate program datum include datum vista disclosure datum update annually available online raw datum available mess immediately suitable analysis set datum transformation perform make datum accessible quick exploration learn refer blow post complimentary r notebook inspiration number petition datum engineer job title increase time part hardware engineer job industry number datum scientist position employer file petition year,Topic 3 (1.052),"[inter, agree, interested, enjoy, much, categorical, always, music, people, use]"
731,context dataset list people involve accident city barcelona spain year till datum manage police city barcelona include several information describe content dataset compose file one contain k line every row contain several information like type injury slightly wound serious injury death include description person driver passenger pedestrian sex age location etc important dataset unload possible datum row misingnot correct description column number expedient case file number code district district code accident barcelona divide several district nom district name district code bare hood code accident every district barcelona several good nom bare name hood code carer street code every street code nom carer name street sum postal caution postal number street description seaman day week text write caravan seaman shortcode previous field also caravan description tip description type day labor festive also caravan number year I de number month nom I name month caravan de I day month description torn type round police mate morning tardy evening night horn de hour day description cause vacant text caravan describe accident case victim pedestrian say es cause del vacant desk tip vehicle implicate type vehicle accident also caravan description sexe sex victim home mean man mean woman description tip person type role accident describe victim pilot conductor passenger pasatger pedestrian vacant eat age victim description victimitzacio type injury caravan slightly wound merit let serious injury merit grow death mort cordenada tm tm coordinate cordenada tm x tm coordinate x see column could remove would loose information experience work file tell I row correct datum datum careful acknowledgment datum find open datum ban barcelona city hall open datum service owner file inspiration I unload information I believe datum share everybody research share also I always happy get feedback help,Topic 3 (1.533),"[inter, agree, interested, enjoy, much, categorical, always, music, people, use]"
2034,dataset description yet,Topic 3 (0.016),"[inter, agree, interested, enjoy, much, categorical, always, music, people, use]"
1131,context datum parliamentary agenda action take verkhovna ukrainian parliament content period th nov th oct list deputy list parliamentary fraction session day daily agenda result include total vote result individual deputy vote speech author tiding full text registration perform session day start acknowledgment source htpdataradagovuaopendatapzskl ukrainian government open datum portal thank everyone make open datum possible photo ilya cherednychenko splash inspiration ukranian parliamentary faction structure parliamentary activity dataset reflect go political event country,Topic 3 (0.178),"[inter, agree, interested, enjoy, much, categorical, always, music, people, use]"
384,context dataset match include individual statistic content dataset individual file tournament number last column absolute value use calculate percentage dataset legend match statistic absolute number format convert percentage use total point number ace absolute number face number double fault sept total serve point skin st serve st win point win st serve point win nd serve seem serve game save break point save place break point face acknowledgment thank je parkman excellent work sure visit github profile htpsgithubcomjefsackmantenis_atp inspiration dataset would likely use develop prediction model tennis match statistic research I plan add historical odd injury datum soon I time get,Topic 3 (0.354),"[inter, agree, interested, enjoy, much, categorical, always, music, people, use]"


In [67]:
final_df[final_df['Dominant_Topic'].str.contains('Topic 4')][['Cleaned_Description', 'Dominant_Topic', 'Topic_Keywords']].sample(5)

Unnamed: 0,Cleaned_Description,Dominant_Topic,Topic_Keywords
1858,context universal taste petrov et al create facilitate future research supervise induction syntactic structure standardize bestpractice update documentation find htpuniversaldependenciesorgupos content file depositor contain mapping trepak specific taste set universal partofspech tag universal tag verb verb tense mode noun nous common proper iron profound adj adjective adv adverse add position proposition postposition cone conjunction meet determine sum cardinal number part particle function word x foreign word type abbreviation punctuation see universal partofspech taste slav petrov dipanjan das run macdonald detail htparxivorgab zipfile contain langtagsetmap file map respective taste pus taste long universal taste eg enptbmap contain mapping english penn tree bank taste universal taste list mapping include arpadtmap bgbtbmap cacatlbmap cspdtmap dadtmap denegramap detigermap elgdtmap enbrownmap enptbmap entwetmap escastlbmap eseaglesmap esiulamap estretagermap eueuslbmap fitdtmap frparismap huszegedmap itistmap iwmilamap jakyotomap javerbmobilmap kosejongmap nlalpinomap plipipanmap ptbosquemap rurncmap slsdtmap svtalbankenmap tumetusbancimap zhctbmap zhsinicamap additional contain ready ready file universal_tagspy script use convert tag universal taste use mapping schneider entwetreadme description sweet tag mapping noah et al situation slav petrov dipanjan das run macdonald universal partofspech taste rec,Topic 4 (2.487),"[year, contain, file, element, age, child, tag, time, numerical, total]"
2043,context automatic composition computer program stand challenge since early day artificial intelligence clear solution modernday research deep learn indeed model latent context representation language prove difficult task singularly combine apply structure procedure knowledge generative fashion quickly become intractable complex domain specific language clear need research devise efficiently learn combination independent problem present basic dataset elementary mathematical function end patron program language encourage future research field benchmark deep learn content dataset contain total elementary mathematical function example input correspond output first line dataset file contain header describe content column first column label function_name follow row contain function name unique inter index next twenty column contain function_input_x function_output_x integer exclusively follow row contain stre end patron float point number execute correspond function final column label function_code contain singleline labia statement elementary mathematical function acknowledgment thank guidance student instructor machine learn berkeley research group discussion member redwood institute theoretical neuroscience inspiration hope see neutral architecture capable program new computer code without human supervision program language solve even scientific compute task license copyright c brandon trabuco permission hereby grant free charge person obtain copy software associate documentation file software deal software without restriction include without limitation right use copy modify merge publish distribute sublicense ardor sell copy software permit person software furnish subject follow condition copyright notice permission notice shall include copy substantial portion software software provide without warranty kind express imply include limited warranty merchantability fitness particular purpose noninfringement event shall author copyright holder liable claim damage liability whether action contract sort otherwise arise connection software use dealing software,Topic 4 (2.453),"[year, contain, file, element, age, child, tag, time, numerical, total]"
1105,context I scrape currently available urban dictionary page content word slang term add urban dictionary definition definition say term author user account contribute term tag list hashtag use vote downvote date date term add urban dictionary acknowledgment I would like thank good friend nail give idea scrape term,Topic 4 (0.370),"[year, contain, file, element, age, child, tag, time, numerical, total]"
1810,context conical metadata package idwebtext nameweb text corpus,Topic 4 (0.148),"[year, contain, file, element, age, child, tag, time, numerical, total]"
1245,context london fire rescue service business england one large firefighting rescue organization world aftermath grenfel tower fire critical firefighting resource accurately appropriately deploy content datum cover jan april consist column contain information time type address call well home station stay duration arrival time attend pump acknowledgment dataset comply city london use kersey analyze share discuss datum eagle look realize update big datum check datum bigquery inspiration borough short average call response long borough great volume call,Topic 4 (0.462),"[year, contain, file, element, age, child, tag, time, numerical, total]"


In [68]:
final_df[final_df['Dominant_Topic'].str.contains('Topic 5')][['Cleaned_Description', 'Dominant_Topic', 'Topic_Keywords']].sample(5)

Unnamed: 0,Cleaned_Description,Dominant_Topic,Topic_Keywords
1473,code federal regulation cf modification general permanent rule regulation sometimes call administrative law publish federal register executive department agency federal government united state subject matter title contain one individual volume update calendar year stagger basis annual update cycle follow title revise january title revise april title revise july title revise october title divide chapter usually bear name issue agency chapter far undivide part cover specific regulatory area large part may undivide support part organize section situation cf refer material section level cf publish multiple format government publish office find late version format htpwgpogovfdsysbulkdatacfr,Topic 5 (0.530),"[numerical, text, number, element, use, date, time, zero, occur, end]"
417,content report list failure commercial bank saving association saving bank since establishment record include institution name number institution charter type location headquarter city state effective date insurance fund certificate number failure transaction type total deposit total asset last report prior failure thousand dollar estimate cost resolution datum estimate loss available insure failure prior slip insure failure acknowledgment bank failure report download webster inspiration type bank institution likely fail bank failure rate change time commercial bank failure cost federal government resolve,Topic 5 (0.849),"[numerical, text, number, element, use, date, time, zero, occur, end]"
1409,world citizen treat equally one way might think measure inequality value person citizenship number country freely travel datum set contain information global reach national passport country datum identify number country passport grant isa free travel isa arrival number country passport welcome destination country datum download htpswpasportindexorgbyindividualrankphp april use follow r scrape package htpsgithubcomsdoriuspasportr information variable visarank passport index rank originally label global rank visage number country passport allow visage travel originally label individual rank vision number country passport allow visaonarival originally label individual rank visawelc number passport accept travel destination country originally label welcome rank,Topic 5 (0.701),"[numerical, text, number, element, use, date, time, zero, occur, end]"
1505,context dataset represent register unemployment city barcelona spain year till register unemployment correspond job demand pende cover last day month exclude employee want change job one readily available incompatible situation one ask specific occupation temporary agricultural beneficiary special unemployment benefit content file dataset format every row represent hood city district number hood name number citizen hood age legal age job column one per month unemployment barcelona good district every hood belong district district form several good acknowledgment datum find open datum ban barcelona city hall open datum service owner file inspiration week ago I need dataset testing purpose I unload information honest opinion datum research share everybody enjoy,Topic 5 (0.792),"[numerical, text, number, element, use, date, time, zero, occur, end]"
1395,context war one thing human specie proud recent time major political leader take step increase decrease tension ally rival part datum science community I believe field explore much affect era comfort seem delusion visualize try predict close next great war might content version dataset small initial version death toll timeline major war infer participate country much possible name war analyse death toll risefal recent year file war column name war name time time period war include start end year long war casualty number death war subsequent version good response would include weapon use participate country entry acknowledgment datum scrape wikipedia manual clean inspiration I hope dataset start butterfly effect lead uprise war general,Topic 5 (1.258),"[numerical, text, number, element, use, date, time, zero, occur, end]"


In [73]:
final_df[final_df['Dominant_Topic'].str.contains('Topic 6')][['Cleaned_Description', 'Dominant_Topic', 'Topic_Keywords']].sample(5)

Unnamed: 0,Cleaned_Description,Dominant_Topic,Topic_Keywords
420,context human many animal ability reduce suppress brain response sensory consequence result action nervous system accomplish corollary discharge forward model system reference copy impend motor plan transmit motor sensory cortex generate corollary discharge representation expect sensory consequence imminent motor act example move eye leave right brain know environment shift speak auditory cortex reduce response expect sound voice schizophrenia chronic mental illness affect people across globe one possible explanation symptom schizophrenia one problem corollary discharge process nervous system make difficult patient differentiate internally externally generate stimulus therefore study process relationship symptom illness might allow well understand abnormal brain process patient diagnosis previously publish leg experiment full report use simple button press task subject either press button immediately generate tone passively listen tone press button without generation tone study corollary discharge people schizophrenia comparison control find comparison control suppress n negative deflection leg brain wave millisecond onset sound press button generate tone compare passive playback patient schizophrenia datum set large sample application previous study specifically leg datum control patient schizophrenia combine control patient previous report method due size raw leg datum preprocese prior unload leg datum acquisition parameter experimental task identical describe paper however preprocese differ individual subject datum least follow datum process step apply order reference average ear love highway filter interposition outer channel continuous leg datum outline define paper chop continuous datum single trial epochs second task event total vaseline correction ms conical correlation analysis remove muscle highfrequency white noise artifact rejection outer single trial outline define paper removal outer component spatial independent component analysis outline define paper interposition outer channel within single trial outline define paper derive datum include eventrelate potential rep average electrode site analyze previous report include ff fez c ff ff c c picture eye calculate average across trial every sample time series separately subject electrode condition content single trial datum channel large unload subject interested type datum find one subject subject among datum file include datum preprocese step list interested compare patient schizophrenia control subject erpdatacsv file contain average rep time series subject condition electrode mention datum along subject information demographicsv could use delicate analysis prior report interested single trial categorizationprediction like graspandlift challenge face decide challenge mergedtrialdatacsv contain summary measurement nearly individual trial subject condition include acknowledgment funding study procedure initial analysis publication come national institute mental health please see grant additional detail cite nigh project number rah work relate datum study participant give write inform consent participate study receive institutional review board approval,Topic 6 (2.086),"[numerical, text, number, year, use, image, element, age, file, date]"
4,context bitcoin long run well know cryptocurency first release open source anonymous satoshi nakamoto bitcoin serve centralized medium digital exchange transaction verify record public distribute ledger blockchain without need trust record keep authority central intermediary transaction block contain cryptographic previous transaction block thus chain together serve immutable record transaction ever occur curencycomodity market bitcoin trading financial instrument soon follow public adoption bitcoin continue grow include historical bitcoin market datum min interval select bitcoin exchange trading take place happy datum mining content coincheckjpy_min_data_to_csv bitflyerjpy_min_data_to_csv coinbaseusd_min_data_to_csv bitstampusd_min_data_to_csv file select bitcoin exchange time period jan jan minute minute update open high low close volume etc indicate currency weight bitcoin price timestamp unit time timestamp without trade activity datum field populated dan timestamp miss jump may exchange apt exchange apt exist unforeseen technical error datum report gather effort make duplicate entry verify content correct complete well ability obviously trust risk acknowledgment inspiration various exchange axis make difficult unintuitive enough get volume datum min interval I set datum scrape project satoshi nakamoto novel core concept blockchain well first execution via bitcoin protocol I would also like thank viewer like wait see code insight share I lowly pad student fun meager spare time find datum interesting spare coffee fuel science send way I would immensely grateful kmwmcqaqnzrdgfdkwehkbgugkbrcf,Topic 6 (0.715),"[numerical, text, number, year, use, image, element, age, file, date]"
423,context dataset create college course work opportunity test deep learn capability computer vision restrict problem idea explore classification problem single coin repression problem group coin try count much money sum see initial approach content two dataset one classification another repression first contain image single coin second contain first one another image two coin present example classification problem five class cent repression example cent every file contain value money filename example pg contain single cent coin one coin coin value pg go find enough coin sum cent example pg different coin type use make interesting finger appear image I try keep distance illumination background constant difference noticed specially illumination change coin position great impact light reflect structure use take picture fact second light source add inspiration model sum coin tell much money group coin could use people vision disability deep learn count classify sum single model split problem pigmentation classification sum amount datum achieve good generalization predict sum beyond dataset great value situation want use dataset purpose contemplate license add reference money l brazilian coin dataset retrieve htplgmonedagithubio acknowledgment I would like thank guiana hard rachel de souza group college course generate dataset,Topic 6 (2.143),"[numerical, text, number, year, use, image, element, age, file, date]"
1310,context datum set include office asesorecorder secure property tax roll span include legally disclosable information include location property value property unique property identify specific property characteristic datum use accurately fairly appraise table property city county san francisco office asesorecorder make representation warranty information provide accurate ardor error omission potential question get start effect prop see historic property tax roll field field dataset full datum dictionary find include follow commonly use geographic shapefile analysis neighborhood supervisor district april acknowledgment datum provide san francisco office asesorecorder via san francisco open datum portal htpsdatasfgovorgdwvmvpq pal odd public domain education licence pal photo flick via rebecca morgan c byncsa,Topic 6 (0.950),"[numerical, text, number, year, use, image, element, age, file, date]"
769,context dataset contain energy usage information every build manage ny das das department citywide administrative service arm new york city municipal government candle ownership management city office facility real estate inventor organization voluntarily publicly disclose selfmeasure information energy use building content datum contain information name address location financial cycle energy usage every build manage time das acknowledgment dataset publish basis city new york inspiration combine dataset new york city building database learn energy usage building new york city use datum model energy consumption city office space large,Topic 6 (0.620),"[numerical, text, number, year, use, image, element, age, file, date]"


In [75]:
final_df[final_df['Dominant_Topic'].str.contains('Topic 7')][['Cleaned_Description', 'Dominant_Topic', 'Topic_Keywords']].sample(5)

Unnamed: 0,Cleaned_Description,Dominant_Topic,Topic_Keywords
695,context referendum hold june decide whether united kingdom remain member european union leave approximately million people vote leave e referendum trout million vote cast content electoral commission publish result e referendum district region vote office national statistic provide population demographic district united kingdom census,Topic 7 (0.034),"[human, de, run, john, year, file, child, age, andrew, paul]"
43,dataset contain list video game sale great copy generate scrape vgchartzcom field include rank bank overall sale name game name platform platform game release pip etc year year game release gene gene game publisher publisher game na_sale sale north america million eu_sale sale europe million jp_sale sale japan million other_sale sale rest world million global_sale total worldwide sale script scrape datum available htpsgithubcomgregorutvgchartzscrape base beautifulsoup use patron record record drop due incomplete information,Topic 7 (1.293),"[human, de, run, john, year, file, child, age, andrew, paul]"
1381,context datum set extract fbiucr webster year population less content list variable population violent_crime_total murder_and_manslaughter forcible_rape robbery aggravated_assault property_crime_total burglar larceny_theft motor_vehicle_theft long acknowledgment I really appreciate fbiucr generosity inspiration impact population crime,Topic 7 (0.469),"[human, de, run, john, year, file, child, age, andrew, paul]"
1045,context thousand cryptocurencie spring past year predict one next etc content dataset contain daily open high low close trading volume cryptocurencie exclude bitcoin acknowledgment htpstimescaledatablobcorewindowsnetdatasetscrypto_datatargz inspiration speculative force always work cryptocurency exchange contain statistical significant feature,Topic 7 (0.359),"[human, de, run, john, year, file, child, age, andrew, paul]"
1767,datum acquire precede bent monthly sale report price average ak stick price stand manufacturer suggest retail price,Topic 7 (0.170),"[human, de, run, john, year, file, child, age, andrew, paul]"
