# This notebook shows how to extract topic features from image tags using TF-IDF weighting and NMF topic modeling

# Import required packages

In [None]:
import pandas as pd
import numpy as np
import math
from google.colab import drive, files
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Mount Google Drive at /content/drive/ so the dataset folder is reachable
# from this Colab session; force_remount=True requests a fresh mount.
drive.mount('/content/drive/', force_remount=True)


Mounted at /content/drive/


In [None]:
# "dataset_dir" is the folder which has the Train and Test datasets
# Create this folder in your Google Drive and upload the datasets from the repo into it.
dataset_dir = '/content/drive/My Drive/peak/dataset_dir'

# Read the train and test CSV files (in that order) from the dataset folder.
df_train, df_test = (
    pd.read_csv(dataset_dir + csv_name)
    for csv_name in ("/df_train.csv", "/df_test.csv")
)


# Topic Modeling using Non-negative Matrix Factorization (NMF) <br>
\begin{equation}
X \approx W \times H
\end{equation}
<BR>
Here X is the (image × tag) matrix, W is the (image × topic) matrix, and H is the (topic × tag) matrix. <br>
[sklearn - NMF](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html) <br>
# Term Frequency-Inverse Document Frequency (TF-IDF) <br>
**Term Frequency:** the number of times a tag appears in an image. <br>
**Inverse Document Frequency:** the logarithm of the total number of images in the training data divided by the number of images that contain the tag, so rarer tags receive higher weights. <br>
[sklearn TF-IDF](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) <br>

In [None]:
# Weight the tags of each training image with TF-IDF, dropping English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Keep X as the sparse matrix returned by fit_transform: NMF accepts sparse
# input directly, so densifying the (images x vocabulary) matrix with
# .toarray() only costs memory. This also matches the test-set path, which
# hands the sparse transform output straight to the model.
X = tfidf_vectorizer.fit_transform(df_train.cleaned_tags)
print("TFIDF's features: ", tfidf_vectorizer.get_feature_names_out())


TFIDF's features:  ['abandoned' 'abbey' 'abdomen' ... 'zoo' 'zoology' 'zoom']


In [None]:
# Number of topics (NMF components) to extract from the tag matrix.
number_of_topics = 20

# Random initialization with a fixed seed, then fit on the training matrix
# and keep the per-image topic weights it returns.
nmf_model = NMF(
    n_components=number_of_topics,
    init='random',
    random_state=5,
)
nmf_features = nmf_model.fit_transform(X)




In [None]:
# Build a table with one row per topic and one column per tag, holding the
# weight the fitted NMF model assigns to each tag within each topic.
topic_tag_weights = nmf_model.components_
tag_names = tfidf_vectorizer.get_feature_names_out()
components_df = pd.DataFrame(data=topic_tag_weights, columns=tag_names)
components_df


Unnamed: 0,abandoned,abbey,abdomen,abend,aboriginal,absorber,abstract,abstraction,abundance,academia,...,young,youth,zebra,zen,zion,zip_up,zombie,zoo,zoology,zoom
0,0.0,0.0,0.0,0.000293,4.2e-05,0.0,0.0,0.000708,0.0,0.0,...,0.0,0.0,0.0,0.001678,0.000372,0.0,0.0,0.0,0.00078,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0001251219,0.0,0.001893,0.0,0.0,...,0.033148,0.0,0.0,0.0,0.0,0.000278,0.0,0.0,0.0,0.000561
2,0.128161,0.006897,0.0,0.0,0.0,0.0,0.0,0.000622,0.0,0.000256,...,0.0,0.0,0.0,0.000571,0.0,3e-06,0.0,0.0,0.0,0.0
3,0.019752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000174,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.357065,0.0,0.000868,0.0,0.0,0.0,0.0,0.079168,0.005091,0.0
5,0.011062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000541,0.0,0.000547,0.0,0.0,0.047701,0.002408,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000112,0.0058,0.0,...,0.018059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9e-05,0.02075
7,0.0,0.0,0.0,0.001695,0.001386,0.0,0.0,0.0,0.00368,0.0,...,0.0,0.0,0.0,0.001056,0.0,0.0,0.001347,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.008706,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.033212,0.0,0.004537,0.000159,0.0,0.0,0.124273,0.000459,0.0,0.0,...,0.0,0.002936,0.0,0.0,0.0,0.0,0.018919,0.0,0.0,0.012272


In [None]:
# For every topic (row of components_df), collect the 20 tags with the
# largest weights, ordered from strongest to weakest association.
individual_topic_dict = {
    topic_idx: components_df.iloc[topic_idx].nlargest(20).index.tolist()
    for topic_idx in range(len(components_df))
}

# One column per topic, one row per rank.
df_train_topic_tag = pd.DataFrame.from_dict(individual_topic_dict)
df_train_topic_tag


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,sea,recreation,architecture,room,mammal,park,business,group,concert,monochrome,wear,flower,transportation_system,baby,togetherness,administration,design,sunset,snow,fashion
1,beach,competition,building,furniture,pet,nature,technology,festival,performance,dark,man,flora,vehicle,son,enjoyment,election,desktop,dawn,cold,pretty
2,water,leisure,city,indoors,animal,wood,text,religion,musician,nude,portrait,leaf,road,child,friendship,leader,art,evening,winter,model
3,ocean,motion,urban,seat,dog,tree,internet,battle,singer,model,adult,garden,car,little,family,league,illustration,sun,ice,sexy
4,seashore,fun,travel,table,cute,outdoors,computer,crowd,band,face,facial_expression,color,street,innocence,facial_expression,meeting,retro,dusk,frost,glamour
5,travel,lifestyle,street,chair,canine,landscape,paper,street,stage,black_and_white,music,bright,wheel,toddler,child,politician,abstract,light,weather,beautiful
6,sky,adult,house,interior_design,looking,summer,indoors,offense,music,studio,woman,floral,traffic,fun,happiness,education,pattern,sky,frozen,young
7,lake,child,outdoors,family,fur,environment,telephone,drag_race,festival,art,people,closeup,action,cute,love,banking,decoration,silhouette,tree,girl
8,sand,outdoors,old,window,domestic,grass,industry,man,guitar,profile,indoors,growth,travel,toy,adolescent,school,texture,landscape,season,hair
9,seascape,action,town,house,sit,fall,facts,police,instrument,eye,outfit,summer,drive,girl,woman,business,vintage,backlit,snowstorm,woman


# Predict Topic Distributions of Test Data

In [None]:
# Vectorize the test tags with the TF-IDF vocabulary fitted on the training
# data, then project them onto the topics of the already-fitted NMF model.
X_test = tfidf_vectorizer.transform(df_test["cleaned_tags"])
nmf_features_test = nmf_model.transform(X_test)

# Display the per-image topic weights of the test set.
pd.DataFrame(nmf_features_test)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.000000,0.000000,0.003894,0.004360,0.000000,0.021853,0.008524,0.000000,0.000000,0.000000,0.077317,0.000000,0.000000,0.000000,0.151370,0.005432,0.000000,0.000000,0.000000,0.003094
1,0.000000,0.000000,0.000000,0.000701,0.000000,0.000000,0.000000,0.008343,0.000000,0.000000,0.076507,0.000000,0.000000,0.000000,0.147227,0.056369,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.001332,0.021806,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.016424,0.111895,0.000000,0.117735,0.044491,0.000000,0.000000,0.000000,0.004922,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.050547,0.105778,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.022150
4,0.000323,0.000000,0.000000,0.048821,0.000072,0.000000,0.000000,0.000000,0.000000,0.000000,0.028621,0.000000,0.000000,0.006318,0.000000,0.000000,0.039204,0.000000,0.000415,0.005294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.000000,0.000000,0.000000,0.000000,0.000835,0.002024,0.000932,0.000000,0.000000,0.000000,0.000000,0.002136,0.168007,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4996,0.000000,0.000000,0.000000,0.000000,0.000000,0.163886,0.000000,0.000000,0.000000,0.000000,0.000000,0.109700,0.000000,0.000000,0.000000,0.000000,0.000000,0.028264,0.014342,0.000000
4997,0.068881,0.000000,0.087164,0.000000,0.000000,0.000873,0.040323,0.014768,0.000000,0.000000,0.000000,0.000000,0.085690,0.000280,0.000000,0.004548,0.000000,0.000000,0.000000,0.000037
4998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.028616,0.000000,0.026094,0.000000,0.000000,0.006354,0.000000,0.068457,0.148388,0.000000,0.000000


In [None]:
# Prepare input files for the training and the test.
# The NMF outputs are used directly as model inputs; np.asarray guarantees an
# ndarray without the detour through a temporary DataFrame.
train_input = np.asarray(nmf_features)
test_input = np.asarray(nmf_features_test)

# Floor the training labels to whole numbers in one vectorized step instead
# of mapping math.floor over the array element by element.
# assumes normalizedpublic is numeric with no missing values — TODO confirm
train_label = np.floor(np.asarray(df_train.normalizedpublic, dtype=float)).astype(int)
# NOTE(review): test labels are left unfloored, unlike the training labels —
# confirm this asymmetry is intended.
test_label = np.array(df_test.normalizedpublic)

# Display the shapes as a quick sanity check.
train_input.shape, test_input.shape, train_label.shape, test_label.shape


((27000, 20), (5000, 20), (27000,), (5000,))