In [2]:
import json
import logging

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [3]:
# We will load the JSON train and test files.

In [4]:
# Load the training set from its JSON file.
# NOTE(review): this is a machine-specific absolute path; change it before
# running the notebook on another machine.
f = "/Users/Alejandro/Desktop/MASTER/Machine learning 23/train.json"   # "/Users/u651278/Downloads/train.json"

# Read the file as UTF-8 explicitly: the records contain non-ASCII author
# names, and open() otherwise falls back to the platform's default encoding.
with open(f, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Convert the records into a DataFrame; missing values become empty strings.
train = pd.DataFrame(json_data).fillna("")

# Show the last rows of the training set to get a feel for the data.
# Only the final expression of a cell is rendered, so the bare
# `train.head(10)` that used to precede this line displayed nothing and
# has been removed.
train.tail(10)

Unnamed: 0,ENTRYTYPE,title,editor,year,publisher,author,abstract
65904,inproceedings,Coreference in Knowledge Editing,,1998,,"[van Deemter, Kees, Power, Richard]",
65905,proceedings,COLING 1965,,1965,,,
65906,inproceedings,Precision-focused Textual Inference,,2007,Association for Computational Linguistics,"[Bobrow, Daniel, Crouch, Dick, King, Tracy Hol...",
65907,inproceedings,Generative Knowledge Graph Construction: A Review,,2022,Association for Computational Linguistics,"[Ye, Hongbin, Zhang, Ningyu, Chen, Hui, Chen, ...",Generative Knowledge Graph Construction (KGC) ...
65908,inproceedings,Mind the Gap: Data Enrichment in Dependency Pa...,,2018,Association for Computational Linguistics,"[Droganova, Kira, Ginter, Filip, Kanerva, Jenn...","In this paper, we focus on parsing rare and no..."
65909,inproceedings,Optimizing the weighted sequence alignment alg...,,2022,Association for Computational Linguistics,"[Janicki, Maciej]",We present an optimized implementation of the ...
65910,proceedings,Proceedings of the 25th Conference on Computat...,"[Bisazza, Arianna, Abend, Omri]",2021,Association for Computational Linguistics,,
65911,article,A Large-Scale Pseudoword-Based Evaluation Fram...,,2014,MIT Press,"[Pilehvar, Mohammad Taher, Navigli, Roberto]",
65912,inproceedings,CIST System for CL-SciSumm 2016 Shared Task,,2016,,"[Li, Lei, Mao, Liyuan, Zhang, Yazhao, Chi, Jun...",
65913,inproceedings,Ontology Engineering and Knowledge Extraction ...,,2009,Association for Computational Linguistics,"[Trapman, Jantine, Monachesi, Paola]",


In [8]:
# Summary statistics for `year`, the variable we want to predict.
# The printed dtype shows whether the column is already numeric; if it were
# not, pd.to_numeric(train['year'], errors='coerce') could be applied first.
year = train['year']
print(year.dtype)

summary_stats = year.describe()
print(summary_stats)

# Location and spread.
mean_value = year.mean()
std_dev = year.std()
se = std_dev / np.sqrt(len(year))   # standard error of the mean

# Range.
min_value = year.min()
max_value = year.max()

# Quartiles.
q1 = year.quantile(0.25)
median_value = year.median()
q3 = year.quantile(0.75)

# Display the results, one "label value" line each.
for label, value in (
    ("Mean:", mean_value),
    ("Standard Deviation:", std_dev),
    ("Standard Error:", se),
    ("Minimum:", min_value),
    ("Maximum:", max_value),
    ("25th Percentile (Q1):", q1),
    ("Median (50th Percentile):", median_value),
    ("75th Percentile (Q3):", q3),
):
    print(label, value)

# NOTE(review): presumably the individually computed values serve as a
# cross-check against the describe() table printed above.

int64
count    65914.000000
mean      2012.520587
std         10.237947
min       1952.000000
25%       2008.000000
50%       2016.000000
75%       2020.000000
max       2023.000000
Name: year, dtype: float64
Mean: 2012.5205874321084
Standard Deviation: 10.237947472227505
Standard Error: 0.039877145520941515
Minimum: 1952
Maximum: 2023
25th Percentile (Q1): 2008.0
Median (50th Percentile): 2016.0
75th Percentile (Q3): 2020.0


In [11]:
# List the column names, then count how many 'abstract' cells are empty
# strings (missing values were replaced by '' when the data was loaded).
print(train.columns)

column_name = 'abstract'
if column_name not in train.columns:
    print(f"Column '{column_name}' not found in the DataFrame.")
else:
    is_empty = train[column_name] == ''
    empty_string_count = is_empty.sum()
    print(f"Number of empty string cells in '{column_name}': {empty_string_count}")



Index(['ENTRYTYPE', 'title', 'editor', 'year', 'publisher', 'author',
       'abstract'],
      dtype='object')
Number of empty string cells in 'abstract': 33642


In [12]:
# Check the dimensions of the DataFrame and the share of empty 'abstract'
# cells, since several columns appear to have a lot of missing data.

print(train.shape)
print()

# Derive both numbers from the data instead of hard-coding counts copied from
# an earlier output, so the figure stays correct if the data changes.
abstract_missing = int((train['abstract'] == '').sum())
print("Percentage of column 'abstract' missing values:", round((abstract_missing*100)/len(train), 2),"%")

# With the current data, more than 50% of the rows have no abstract.

(65914, 7)

Percentage of column 'abstract' missing values: 51.04 %


In [13]:
# Repeat the empty-string check for the remaining text columns.
# One loop replaces three copy-pasted stanzas, and the percentage is computed
# from the data itself rather than from hard-coded counts.
for position, column_name in enumerate(['editor', 'publisher', 'author']):
    if position:
        print()   # blank line between the reports of two columns
    if column_name in train.columns:
        empty_string_count = int((train[column_name] == '').sum())
        print(f"Number of empty string cells in '{column_name}': {empty_string_count}")
        # round(..., 2) keeps two decimals of the percentage.
        print(f"Percentage of column '{column_name}' missing values:", round((empty_string_count*100)/len(train), 2),"%")
    else:
        # No percentage is reported for a column that does not exist.
        print(f"Column '{column_name}' not found in the DataFrame.")


Number of empty string cells in 'editor': 64438
Percentage of column 'editor' missing values: 97.76 %

Number of empty string cells in 'publisher': 8201
Percentage of column 'publisher' missing values: 12.44 %

Number of empty string cells in 'author': 2399
Percentage of column 'author' missing values: 3.64 %


In [14]:
# So much of the 'editor' column is unavailable that we remove it from the
# training DataFrame.
column_to_remove = 'editor'
train = train.drop(columns=column_to_remove)

# Display the frame without the dropped column.
pd.DataFrame(train)

Unnamed: 0,ENTRYTYPE,title,year,publisher,author,abstract
0,inproceedings,Philippine Language Resources: Trends and Dire...,2009,Association for Computational Linguistics,"[Roxas, Rachel Edita, Cheng, Charibeth, Lim, N...",
1,inproceedings,A System for Translating Locative Prepositions...,1991,Association for Computational Linguistics,"[Japkowicz, Nathalie, Wiebe, Janyce M.]",
2,inproceedings,Introduction to the Shared Task on Comparing S...,2008,College Publications,"[Bos, Johan]",
3,inproceedings,Pynini: A Python library for weighted finite-s...,2016,Association for Computational Linguistics,"[Gorman, Kyle]",
4,inproceedings,Improving Readability of Swedish Electronic He...,2014,Association for Computational Linguistics,"[Grigonyte, Gintarė, Kvist, Maria, Velupillai,...",
...,...,...,...,...,...,...
65909,inproceedings,Optimizing the weighted sequence alignment alg...,2022,Association for Computational Linguistics,"[Janicki, Maciej]",We present an optimized implementation of the ...
65910,proceedings,Proceedings of the 25th Conference on Computat...,2021,Association for Computational Linguistics,,
65911,article,A Large-Scale Pseudoword-Based Evaluation Fram...,2014,MIT Press,"[Pilehvar, Mohammad Taher, Navigli, Roberto]",
65912,inproceedings,CIST System for CL-SciSumm 2016 Shared Task,2016,,"[Li, Lei, Mao, Liyuan, Zhang, Yazhao, Chi, Jun...",


In [15]:
# Remove 'abstract' from the training DataFrame as well, to try to improve
# the MAE.
column_to_remove2 = 'abstract'
train = train.drop(columns=column_to_remove2)

# Display the frame without the dropped column.
pd.DataFrame(train)

Unnamed: 0,ENTRYTYPE,title,year,publisher,author
0,inproceedings,Philippine Language Resources: Trends and Dire...,2009,Association for Computational Linguistics,"[Roxas, Rachel Edita, Cheng, Charibeth, Lim, N..."
1,inproceedings,A System for Translating Locative Prepositions...,1991,Association for Computational Linguistics,"[Japkowicz, Nathalie, Wiebe, Janyce M.]"
2,inproceedings,Introduction to the Shared Task on Comparing S...,2008,College Publications,"[Bos, Johan]"
3,inproceedings,Pynini: A Python library for weighted finite-s...,2016,Association for Computational Linguistics,"[Gorman, Kyle]"
4,inproceedings,Improving Readability of Swedish Electronic He...,2014,Association for Computational Linguistics,"[Grigonyte, Gintarė, Kvist, Maria, Velupillai,..."
...,...,...,...,...,...
65909,inproceedings,Optimizing the weighted sequence alignment alg...,2022,Association for Computational Linguistics,"[Janicki, Maciej]"
65910,proceedings,Proceedings of the 25th Conference on Computat...,2021,Association for Computational Linguistics,
65911,article,A Large-Scale Pseudoword-Based Evaluation Fram...,2014,MIT Press,"[Pilehvar, Mohammad Taher, Navigli, Roberto]"
65912,inproceedings,CIST System for CL-SciSumm 2016 Shared Task,2016,,"[Li, Lei, Mao, Liyuan, Zhang, Yazhao, Chi, Jun..."


In [16]:
# Load the test set from its JSON file.
# NOTE(review): this is a machine-specific absolute path; change it before
# running the notebook on another machine.
f =  "/Users/Alejandro/Desktop/MASTER/Machine learning 23/test.json"  # "/Users/u651278/Downloads/test.json"

# Read the file as UTF-8 explicitly: the records contain non-ASCII author
# names, and open() otherwise falls back to the platform's default encoding.
with open(f, 'r', encoding='utf-8') as file:
    json_data2 = json.load(file)

# Convert the records into a DataFrame; missing values become empty strings.
test = pd.DataFrame(json_data2).fillna("")
test.head(10)

# We look at the first rows of the test set to get a feel for it. Note that
# the target column 'year' is absent from the test data.

Unnamed: 0,ENTRYTYPE,title,editor,publisher,author,abstract
0,inproceedings,Learning to lemmatise Polish noun phrases,,Association for Computational Linguistics,"[Radziszewski, Adam]",
1,inproceedings,The Treebanked Conspiracy. Actors and Actions ...,,,"[Passarotti, Marco, González Saavedra, Berta]",
2,inproceedings,Linguistic structure and machine translation,,,"[Lamb, Sydney M.]",If one understands the nature of linguistic st...
3,inproceedings,NSEmo at EmoInt-2017: An Ensemble to Predict E...,,Association for Computational Linguistics,"[Madisetty, Sreekanth, Desarkar, Maunendra San...","In this paper, we describe a method to predict..."
4,inproceedings,Explaining data using causal Bayesian networks,,Association for Computational Linguistics,"[Sevilla, Jaime]",I introduce Causal Bayesian Networks as a form...
5,inproceedings,UnihanLM: Coarse-to-Fine Chinese-Japanese Lang...,,Association for Computational Linguistics,"[Xu, Canwen, Ge, Tao, Li, Chenliang, Wei, Furu]",Chinese and Japanese share many characters wit...
6,inproceedings,Improving Human Annotation Effectiveness for F...,,Association for Computational Linguistics,"[Kamath, Pranav, Sun, Yiwen, Semere, Thomas, G...",Identifying and integrating missing facts is a...
7,inproceedings,Collective Entity Disambiguation with Structur...,,Association for Computational Linguistics,"[Yang, Yi, Irsoy, Ozan, Rahman, Kazi Shefaet]",We present a gradient-tree-boosting-based stru...
8,inproceedings,Likelihood Ratio-based Forensic Voice Comparis...,,,"[Frost, Daniel, Ishihara, Shunichi]",
9,inproceedings,Using Core Ontology for Domain Lexicon Structu...,,European Language Resources Association (ELRA),"[Marinelli, Rita, Roventini, Adriana, Spadoni,...",The users demand has determined the need to m...


In [17]:
# Remove the 'editor' column from the test set too, mirroring what was done
# for the training set.

# Plain-text dump of the frame before the column is removed.
print(test)

column_to_remove = 'editor'
test = test.drop(columns=column_to_remove)

# Display the frame without the dropped column.
pd.DataFrame(test)

# NOTE: the 'abstract' column is still present in the test set at this point.

           ENTRYTYPE                                              title  \
0      inproceedings          Learning to lemmatise Polish noun phrases   
1      inproceedings  The Treebanked Conspiracy. Actors and Actions ...   
2      inproceedings       Linguistic structure and machine translation   
3      inproceedings  NSEmo at EmoInt-2017: An Ensemble to Predict E...   
4      inproceedings     Explaining data using causal Bayesian networks   
...              ...                                                ...   
21967  inproceedings  Scalable Font Reconstruction with Dual Latent ...   
21968  inproceedings  UniGeo: Unifying Geometry Logical Reasoning vi...   
21969  inproceedings  Gradient-guided Unsupervised Lexically Constra...   
21970  inproceedings  Semantically Constrained Multilayer Annotation...   
21971  inproceedings  Annotating Students' Understanding of Science ...   

      editor                                       publisher  \
0                  Association for 

Unnamed: 0,ENTRYTYPE,title,publisher,author,abstract
0,inproceedings,Learning to lemmatise Polish noun phrases,Association for Computational Linguistics,"[Radziszewski, Adam]",
1,inproceedings,The Treebanked Conspiracy. Actors and Actions ...,,"[Passarotti, Marco, González Saavedra, Berta]",
2,inproceedings,Linguistic structure and machine translation,,"[Lamb, Sydney M.]",If one understands the nature of linguistic st...
3,inproceedings,NSEmo at EmoInt-2017: An Ensemble to Predict E...,Association for Computational Linguistics,"[Madisetty, Sreekanth, Desarkar, Maunendra San...","In this paper, we describe a method to predict..."
4,inproceedings,Explaining data using causal Bayesian networks,Association for Computational Linguistics,"[Sevilla, Jaime]",I introduce Causal Bayesian Networks as a form...
...,...,...,...,...,...
21967,inproceedings,Scalable Font Reconstruction with Dual Latent ...,Association for Computational Linguistics,"[Srivatsan, Nikita, Wu, Si, Barron, Jonathan, ...",We propose a deep generative model that perfor...
21968,inproceedings,UniGeo: Unifying Geometry Logical Reasoning vi...,Association for Computational Linguistics,"[Chen, Jiaqi, Li, Tong, Qin, Jinghui, Lu, Pan,...",Geometry problem solving is a well-recognized ...
21969,inproceedings,Gradient-guided Unsupervised Lexically Constra...,Association for Computational Linguistics,"[Sha, Lei]",Lexically constrained generation requires the ...
21970,inproceedings,Semantically Constrained Multilayer Annotation...,Association for Computational Linguistics,"[Prange, Jakob, Schneider, Nathan, Abend, Omri]",We propose a coreference annotation scheme as ...


In [18]:
# Remove the 'abstract' column from the test set.

# Plain-text dump of the frame before the column is removed.
print(test)

column_to_remove = 'abstract'
test = test.drop(columns=column_to_remove)

# Display the frame without the dropped column.
pd.DataFrame(test)


           ENTRYTYPE                                              title  \
0      inproceedings          Learning to lemmatise Polish noun phrases   
1      inproceedings  The Treebanked Conspiracy. Actors and Actions ...   
2      inproceedings       Linguistic structure and machine translation   
3      inproceedings  NSEmo at EmoInt-2017: An Ensemble to Predict E...   
4      inproceedings     Explaining data using causal Bayesian networks   
...              ...                                                ...   
21967  inproceedings  Scalable Font Reconstruction with Dual Latent ...   
21968  inproceedings  UniGeo: Unifying Geometry Logical Reasoning vi...   
21969  inproceedings  Gradient-guided Unsupervised Lexically Constra...   
21970  inproceedings  Semantically Constrained Multilayer Annotation...   
21971  inproceedings  Annotating Students' Understanding of Science ...   

                                            publisher  \
0           Association for Computational 

Unnamed: 0,ENTRYTYPE,title,publisher,author
0,inproceedings,Learning to lemmatise Polish noun phrases,Association for Computational Linguistics,"[Radziszewski, Adam]"
1,inproceedings,The Treebanked Conspiracy. Actors and Actions ...,,"[Passarotti, Marco, González Saavedra, Berta]"
2,inproceedings,Linguistic structure and machine translation,,"[Lamb, Sydney M.]"
3,inproceedings,NSEmo at EmoInt-2017: An Ensemble to Predict E...,Association for Computational Linguistics,"[Madisetty, Sreekanth, Desarkar, Maunendra San..."
4,inproceedings,Explaining data using causal Bayesian networks,Association for Computational Linguistics,"[Sevilla, Jaime]"
...,...,...,...,...
21967,inproceedings,Scalable Font Reconstruction with Dual Latent ...,Association for Computational Linguistics,"[Srivatsan, Nikita, Wu, Si, Barron, Jonathan, ..."
21968,inproceedings,UniGeo: Unifying Geometry Logical Reasoning vi...,Association for Computational Linguistics,"[Chen, Jiaqi, Li, Tong, Qin, Jinghui, Lu, Pan,..."
21969,inproceedings,Gradient-guided Unsupervised Lexically Constra...,Association for Computational Linguistics,"[Sha, Lei]"
21970,inproceedings,Semantically Constrained Multilayer Annotation...,Association for Computational Linguistics,"[Prange, Jakob, Schneider, Nathan, Abend, Omri]"


In [19]:
# Validation set: hold out 33% of the rows with a fixed seed.
# `train` is rebound to the remaining rows, so every later cell fits on the
# reduced training set. train_test_split comes from the import cell at the
# top of the notebook.
train, validation = train_test_split(
    train,
    test_size=0.33,
    random_state=42,
)

# Show the held-out rows as a pandas DataFrame.
pd.DataFrame(validation)


Unnamed: 0,ENTRYTYPE,title,year,publisher,author
12680,inproceedings,Question-Answering Based on Virtually Integrat...,2003,Association for Computational Linguistics,"[Choi, Key-Sun, Kim, Jae-Ho, Miyazaki, Masaru,..."
17292,inproceedings,AMI&ERIC: How to Learn with Naive Bayes and Pr...,2013,Association for Computational Linguistics,"[Dermouche, Mohamed, Khouas, Leila, Velcin, Ju..."
33265,inproceedings,Inducing Gazetteers for Named Entity Recogniti...,2008,Association for Computational Linguistics,"[Kazama, Jun'ichi, Torisawa, Kentaro]"
52850,inproceedings,Leveraging Explicit Lexico-logical Alignments ...,2022,Association for Computational Linguistics,"[Sun, Runxin, He, Shizhu, Zhu, Chong, He, Yaoh..."
2298,inproceedings,CLAM: Quickly deploy NLP command-line tools on...,2014,Dublin City University and Association for Com...,"[van Gompel, Maarten, Reynaert, Martin]"
...,...,...,...,...,...
36482,inproceedings,Modeling Blame to Avoid Positive Face Threats ...,2014,Association for Computational Linguistics,"[Briggs, Gordon, Scheutz, Matthias]"
38156,inproceedings,Self-supervised Product Title Rewrite for Prod...,2022,Association for Computational Linguistics,"[Zhao, Xue, Liu, Dayiheng, Ding, Junwei, Yao, ..."
54470,inproceedings,Lexical Semantics in Human-Computer Communication,1984,Association for Computational Linguistics,"[Rosenberg, Jarrett]"
62249,inproceedings,Evaluation of HTR models without Ground Truth ...,2022,European Language Resources Association,"[Ströbel, Phillip Benjamin, Volk, Martin, Clem..."


In [41]:
# Ridge regression on bag-of-words counts from title, author, entry type and
# publisher, scored by mean absolute error on the validation set.

# Cast 'title' and 'author' to str in both frames before vectorising.
# NOTE(review): 'author' appears to hold lists of names; astype(str) turns
# each list into its string representation. Confirm this is intended.
for frame in (train, validation):
    for text_column in ('title', 'author'):
        frame[text_column] = frame[text_column].astype(str)

# One CountVectorizer per text column; every other column is dropped.
# Earlier single-column experiments were kept here inside a bare
# triple-quoted string, which was evaluated and discarded on every run;
# that dead code has been removed.
featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title"),
                ("author", CountVectorizer(), "author"),
                ("ENTRYTYPE", CountVectorizer(), "ENTRYTYPE"),
                ("publisher", CountVectorizer(), "publisher")],
        remainder='drop')

# Both pipelines are built around the same featurizer instance.
dummy = make_pipeline(featurizer, DummyRegressor(strategy = 'mean'))
ridge = make_pipeline(featurizer, Ridge())

logging.info("Fitting models")
dummy.fit(train.drop('year', axis=1), train['year'].values)
ridge.fit(train.drop('year', axis=1), train['year'].values)

# Only the ridge model is scored here; the dummy baseline is fitted but not
# evaluated in this cell.
err = mean_absolute_error(validation['year'].values, ridge.predict(validation.drop('year', axis=1)))
print(err)

4.6425969854805045


In [None]:
# Random forest on bag-of-words counts from the title and the entry type.
# RandomForestRegressor and the other helpers come from the import cell at
# the top of the notebook.

# The comma after the transformers list was missing (a SyntaxError), and the
# second transformer name was misspelled "ENTRYTPE".
featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title"),
                ("ENTRYTYPE", CountVectorizer(), "ENTRYTYPE")],
        remainder='drop')

# Fixed seed so that repeated runs build the same forest.
Random_Forest = make_pipeline(featurizer, RandomForestRegressor(random_state=42))
logging.info("Fitting models")
Random_Forest.fit(train.drop('year', axis=1), train['year'].values)

# Score on the validation set. The log label now names the model actually
# being evaluated (it used to say "Mean baseline MAE").
err = mean_absolute_error(validation['year'].values, Random_Forest.predict(validation.drop('year', axis=1)))
logging.info(f"Random forest MAE: {err}")
print(err)

# Predict the test set. A 'year' column left over from a previous run of this
# cell is ignored, so the cell can be re-run safely.
pred = Random_Forest.predict(test.drop(columns='year', errors='ignore'))
test['year'] = pred


: 

In [None]:
# Hyper-parameter search for a random forest on title bag-of-words counts.
# GridSearchCV and the other helpers come from the import cell at the top of
# the notebook.

# Constants
TITLE_COLUMN = 'title'
TARGET_COLUMN = 'year'

# Feature engineering: word counts of the title; all other columns dropped.
featurizer = ColumnTransformer(
    transformers=[(TITLE_COLUMN, CountVectorizer(), TITLE_COLUMN)],
    remainder='drop'
)

# Create pipelines. `dummy` is defined for parity with the other cells but is
# not fitted or evaluated here.
dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))
# Fixed seed so the search is repeatable.
random_forest = make_pipeline(featurizer, RandomForestRegressor(random_state=42))

# Define the parameter grid for Random Forest.
# 3 x 3 x 3 x 3 = 81 combinations, each cross-validated on 5 folds, so this
# cell is expensive to run.
param_grid = {
    'randomforestregressor__n_estimators': [50, 100, 200],
    'randomforestregressor__max_depth': [None, 10, 20],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search Cross-Validation
grid_search = GridSearchCV(random_forest, param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(train.drop(TARGET_COLUMN, axis=1), train[TARGET_COLUMN].values)

# Get the best model and its parameters
best_random_forest_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Report the best parameters. logging.info() is silent unless the root logger
# has been set to INFO, so the result is printed as well.
logging.info(f'Best Parameters: {best_params}')
print(f'Best Parameters: {best_params}')

# Evaluate the best model on the held-out validation set.
err = mean_absolute_error(validation[TARGET_COLUMN].values, best_random_forest_model.predict(validation.drop(TARGET_COLUMN, axis=1)))
logging.info(f'Mean Absolute Error on Validation Set: {err}')
print(f'Mean Absolute Error on Validation Set: {err}')


In [None]:
# Title-only features: fit the mean baseline and a random forest, then report
# the baseline's MAE on the validation set.
# All helpers come from the import cell at the top of the notebook.

featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title")],
        remainder='drop')
dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))
# Fixed seed so that repeated runs build the same forest.
Random_Forest = make_pipeline(featurizer, RandomForestRegressor(random_state=42))
logging.info("Fitting models")
dummy.fit(train.drop('year', axis=1), train['year'].values)
# Fit the forest built in this cell. This line used to fit `ridge`, a model
# left over from an earlier cell, leaving Random_Forest unfitted.
Random_Forest.fit(train.drop('year', axis=1), train['year'].values)

# MAE of the mean baseline on the validation set.
err = mean_absolute_error(validation['year'].values, dummy.predict(validation.drop('year', axis=1)))
print(err)



7.8803642292039875


In [None]:
def _load_frame(path):
    """Read a JSON file of records into a DataFrame, replacing missing values with ''."""
    # The context manager closes the file handle; the encoding is explicit so
    # the result does not depend on the platform default.
    with open(path, encoding='utf-8') as handle:
        return pd.DataFrame.from_records(json.load(handle)).fillna("")


def main(train_path='train.json', test_path='test.json', output_path='predicted.json'):
    """Train title-based baselines, log their validation MAE and write test predictions.

    Parameters
    ----------
    train_path : str
        JSON file of training records; must contain 'title' and 'year'.
    test_path : str
        JSON file of test records; must contain 'title'.
    output_path : str
        Where the test records, with a predicted 'year' added, are written.

    The defaults reproduce the file names that used to be hard-coded.
    """
    logging.getLogger().setLevel(logging.INFO)
    logging.info("Loading training/test data")
    train = _load_frame(train_path)
    test = _load_frame(test_path)

    logging.info("Splitting validation")
    # Stratify on the target so both parts have a similar year distribution.
    train, val = train_test_split(train, stratify=train['year'], random_state=123)

    # Word counts of the title are the only features; other columns are dropped.
    featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title")],
        remainder='drop')
    dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))
    ridge = make_pipeline(featurizer, Ridge())

    logging.info("Fitting models")
    dummy.fit(train.drop('year', axis=1), train['year'].values)
    ridge.fit(train.drop('year', axis=1), train['year'].values)

    logging.info("Evaluating on validation data")
    err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
    logging.info(f"Mean baseline MAE: {err}")
    err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
    logging.info(f"Ridge regress MAE: {err}")

    logging.info("Predicting on test")
    pred = ridge.predict(test)
    test['year'] = pred

    logging.info("Writing prediction file")
    test.to_json(output_path, orient='records', indent=2)

main()

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812336775250446
INFO:root:Predicting on test
INFO:root:Writing prediction file
