In [None]:
import pandas as pd
from matplotlib import pyplot as plt


In [None]:
# Load the raw CSV export into a DataFrame and preview its first rows.
# NOTE(review): the file name suggests a BigQuery result export — confirm provenance.
dataset = pd.read_csv(
    filepath_or_buffer='../data/bq-results-20200205-092131-ioej0ewh4vcc.csv'
)
dataset.head(n=5)

In [None]:
def filter_tag_position(position):
    """Build a DataFrame filter that keeps rows at a given tag position.

    Parameters
    ----------
    position
        Value that the ``tag_position`` column must equal for a row to be kept.

    Returns
    -------
    callable
        A one-argument function ``df -> DataFrame`` suitable for
        ``DataFrame.pipe``; it returns the rows of ``df`` whose
        ``tag_position`` equals ``position``.
    """
    def keep_rows_at_position(df):
        is_at_position = df.tag_position == position
        return df.loc[is_at_position]

    return keep_rows_at_position

def filter_tags_with_less_than_x_samples(x):
    """Build a DataFrame filter that drops tags with fewer than ``x`` posts.

    Parameters
    ----------
    x
        Minimum number of distinct ``post_id`` values a ``tag_name`` must
        have for its rows to be kept.

    Returns
    -------
    callable
        A one-argument function ``df -> DataFrame`` suitable for
        ``DataFrame.pipe``; it returns only the rows of ``df`` whose
        ``tag_name`` has at least ``x`` distinct ``post_id`` values.
    """
    def keep_frequent_tags(df):
        # Number of distinct posts per tag, indexed by tag name.
        posts_per_tag = df.groupby('tag_name')['post_id'].nunique()
        frequent_tags = posts_per_tag.index[posts_per_tag >= x]
        keep_row = df.tag_name.isin(frequent_tags)
        return df.loc[keep_row]

    return keep_frequent_tags

# Keep only rows whose tag_position is 0, then drop every tag that appears on
# fewer than 100 distinct posts. The chain starts from a copy, so `dataset`
# itself is not modified.
processed_dataset = (
    dataset.copy()
    .pipe(filter_tag_position(0))
    .pipe(filter_tags_with_less_than_x_samples(100))
)

# Report how many rows and how many distinct tags survive the filters.
print(processed_dataset.shape[0], processed_dataset['tag_name'].nunique())

In [None]:
# train, test, validation split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Fixed seed so the splits and the SGD fit are reproducible when the notebook
# is re-run from a fresh kernel.
RANDOM_SEED = 42

# 80% of the rows go to training; stratifying on the tag keeps the class
# proportions the same in both partitions.
x_train, x_non_train, y_train, y_non_train = train_test_split(
    processed_dataset['title'], processed_dataset['tag_name'],
    train_size=0.8,
    stratify=processed_dataset['tag_name'],
    random_state=RANDOM_SEED)

# The remaining 20% is halved into a test set and a validation set.
x_test, x_val, y_test, y_val = train_test_split(
    x_non_train, y_non_train,
    train_size=0.5,
    stratify=y_non_train,
    random_state=RANDOM_SEED)

# Hash the title text into features and fit a linear classifier with SGD.
pipeline = Pipeline([
    ('feature_extraction', HashingVectorizer()),
    ('model', SGDClassifier(random_state=RANDOM_SEED))
])

model = pipeline.fit(x_train, y_train)

y_test_hat = model.predict(x_test)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and counts support over the predictions.
print(classification_report(y_test, y_test_hat))