In [7]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import ParameterSampler
from sklearn.feature_extraction.text import CountVectorizer
from trainer import GridSearchCVTrainer
import numpy as np
from scipy.sparse import hstack

# Load the dataset splits for one project and blank out missing descriptions.
project_name = 'titanium'


def _build_features(frame, title_vec, description_vec):
    """Assemble the sparse feature matrix for the rows of *frame*.

    Columns are, in order: title n-gram counts, description n-gram counts,
    title length in characters, description length in characters.

    Args:
        frame: DataFrame with string-valued 'title' and 'description' columns.
        title_vec: fitted vectorizer applied to the 'title' column.
        description_vec: fitted vectorizer applied to the 'description' column.

    Returns:
        A scipy CSR matrix with one row per row of *frame*.
    """
    return hstack(
        [
            title_vec.transform(frame['title']).astype(float),
            description_vec.transform(frame['description']).astype(float),
            frame['title'].apply(len).to_numpy().reshape(-1, 1),
            frame['description'].apply(len).to_numpy().reshape(-1, 1),
        ],
        # hstack defaults to COO, which cannot be row-indexed; CSR can be
        # sliced directly when the data is split into folds.
        format='csr',
    )


# The train and validation files are merged into a single training set.
data_train = pd.concat([pd.read_csv(f'{project_name}/{project_name}_dataset_train.csv'),
                        pd.read_csv(f'{project_name}/{project_name}_dataset_valid.csv')])
data_test = pd.read_csv(f'{project_name}/{project_name}_dataset_test.csv')

# Assign the cleaned column back instead of calling an inplace method on a
# column selection: under pandas Copy-on-Write the inplace form only mutates
# a temporary and leaves the DataFrame unchanged.
data_train['description'] = data_train['description'].fillna('')
data_test['description'] = data_test['description'].fillna('')

# Vectorize title (unigrams + bigrams seen in at least 2 documents).
# Fitted on the training text only so that test documents influence neither
# the vocabulary nor the min_df document counts.
title_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2)
title_vectorizer.fit(data_train['title'])

# Vectorize description, with the same settings and the same train-only fit.
description_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2)
description_vectorizer.fit(data_train['description'])

X_train = _build_features(data_train, title_vectorizer, description_vectorizer)
y_train = data_train['storypoint'].to_numpy().astype(float)

X_test = _build_features(data_test, title_vectorizer, description_vectorizer)
y_test = data_test['storypoint'].to_numpy().astype(float)

# Candidate regularisation strengths for Ridge, spanning 1e-4 .. 1e4.
alphas = [.0001, .001, .01, .1, 1, 10, 100, 1000, 10000]
params = {'alpha': alphas}

# GridSearchCVTrainer is project-local; presumably it cross-validates every
# combination in `params` (9 here) — confirm against trainer.py.
trainer = GridSearchCVTrainer("Ridge", Ridge(), params)

trainer.fit(X_train, y_train)

Training combination 1/9
