In [2]:
import thesis_utils as tu
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import seaborn as sns
from analysis import data_prep
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the project dataset once and keep handles to the two tables used below.
# NOTE(review): both attributes are later column-sliced and passed to pd.merge,
# so they are presumably pandas DataFrames -- confirm in thesis_utils.
data = tu.Dataset()
td_df, sd_df = data.translated_data, data.site_data

In [None]:
# Attach each translated row's language to the category/origin of its site
# record (default inner join on site_data_id == id).
translation_cols = td_df[['site_data_id', 'original_language']]
site_cols = sd_df[['id', 'category', 'origin']]
m_df = translation_cols.merge(site_cols, left_on='site_data_id', right_on='id')

# Split the merged rows by the value of their 'origin' column.
is_original = m_df['origin'].eq('original')
is_additional = m_df['origin'].eq('additional')
original = m_df.loc[is_original]
additional = m_df.loc[is_additional]


In [None]:
# Compare several classifiers with k-fold cross-validation on the training
# split and save a boxplot of the per-fold accuracies.
#
# NOTE(review): `data_prep` is a project helper; it is assumed to return
# (feature matrix, labels, fitted vectorizer) -- confirm in analysis.data_prep.
X, y, vectorizer = data_prep(data)
le = LabelEncoder()
y = le.fit_transform(y)  # encode class labels as integers
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

models = [
    LinearSVC(),
    MLPClassifier(random_state=1, max_iter=500, hidden_layer_sizes=1000),
    SVC(kernel='precomputed'),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    LogisticRegression(random_state=0),
    MultinomialNB(),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
]

CV = 5  # number of cross-validation folds
entries = []
for model in models:
    model_name = model.__class__.__name__
    if getattr(model, 'kernel', None) == 'precomputed':
        # A precomputed-kernel estimator is fitted on a square Gram matrix
        # rather than on the raw features; cosine similarity is used here.
        train_X_cosine = pairwise_kernels(train_X, metric='cosine')
        cv_features = train_X_cosine
    else:
        cv_features = train_X
    accuracies = cross_val_score(model, cv_features, train_y, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )

# One row per (classifier, fold); built once from the collected records.
cv_df = pd.DataFrame(entries, columns=['Classifier', 'fold_idx', 'Accuracy'])

fig, ax = plt.subplots()
sns.boxplot(x='Classifier', y='Accuracy', data=cv_df, ax=ax)
# Classifier names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
fig.savefig('plot_explore_classifiers.pdf')

In [None]:
# Mean cross-validated accuracy per classifier. The column names must match
# the ones cv_df was built with: 'Classifier' and 'Accuracy'.
print(cv_df.groupby('Classifier')['Accuracy'].mean())

In [None]:
# Bar chart of how many original English rows fall into each category.
# Categories that occur in m_df but not in this subset are shown with count 0.
original = m_df[(m_df['origin'] == 'original') & (m_df['original_language'] == 'en')]
categories = m_df['category'].unique()

bar_width = 1
opacity = 0.5

# reindex adds the missing categories with a zero count in one step
# (Series.append no longer exists in pandas >= 2.0); sort for a stable order.
df1_c = (
    original['category']
    .value_counts()
    .reindex(categories, fill_value=0)
    .sort_index()
)

# Take the tick labels from the plotted Series itself so that every label is
# guaranteed to sit under the bar it describes.
category_labels = list(df1_c.index)
positions = np.arange(len(df1_c))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(positions, df1_c.values, bar_width, alpha=opacity, color='b', label='Original Data')
ax.set_xlabel('Categories')
ax.set_ylabel('Count')
ax.set_xticks(positions)
ax.set_xticklabels(category_labels, rotation=90, fontsize=8)
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)

# df1_c holds the per-category counts shown in the plot.
# Author's note: these counts summed to 275 when the notebook was written.

In [None]:
# Inspect the per-category counts behind the bar chart, then their total.
# Only the last expression of a cell is echoed automatically, so the Series is
# printed explicitly; the sum is left as the cell's displayed result.
print(df1_c)
df1_c.sum()