In [103]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper

# Base directory that every data path in this notebook is resolved against:
# the parent of the directory the kernel is running in.
ROOT_PATH = Path("..")

In [104]:
# Read the raw metadata, then keep only the rows whose id is listed in the
# song-vs-call id file.
df = pd.read_csv(ROOT_PATH / "data/raw/metadata.csv")
svc_ids = pd.read_json(ROOT_PATH / "data/raw/song_vs_call.json").squeeze()
in_svc_subset = df["id"].isin(svc_ids)
svc_df = df.loc[in_svc_subset].copy()

# Read the stored split; the file is only needed while parsing, so the id
# collections are pulled out after it has been closed.
with open(ROOT_PATH / "data/processed/svc_split.json") as svc_split_file:
    svc_split = json.load(svc_split_file)

train_ids, test_ids = svc_split["train_ids"], svc_split["test_ids"]

In [110]:
# Add response variable.
# The free-text `type` column is normalised into a list of tags per row:
# lower-cased, spaces removed, split on commas.
type_col = svc_df["type"].str.lower().str.replace(" ", "").str.split(",")

# Tags left over once the two target labels are removed.
filtered_type_col = type_col.apply(lambda tags: set(tags) - {"call", "song"})

# Binary target: 1 when "call" is one of the row's tags, 0 otherwise.
is_call = type_col.apply(lambda tags: "call" in tags)
svc_df["pred"] = is_call.astype(int)

In [111]:
# Add gender feature
def filter_gender(labels):
    """Pick the gender tag out of one recording's type labels.

    Args:
        labels: Collection of tag strings for a single recording. ``None`` or
            a float (such as the NaN pandas produces for a missing value) is
            treated as "no labels".

    Returns:
        ``"male"`` if that tag is present, otherwise ``"female"`` if that tag
        is present, otherwise ``np.nan``. ``"male"`` takes precedence when
        both tags occur.
    """
    # A missing value is not iterable, so `"male" in labels` would raise
    # TypeError on it; report the gender as unknown instead.
    if labels is None or isinstance(labels, float):
        return np.nan

    # Order matters: it encodes the "male before female" precedence.
    for gender in ("male", "female"):
        if gender in labels:
            return gender
    return np.nan


# Derive one gender value per row from that row's tag list.
svc_df["gender"] = type_col.map(filter_gender)

In [112]:
# Add age feature
def filter_age(labels):
    """Pick the age tag out of one recording's type labels.

    Args:
        labels: Collection of tag strings for a single recording. ``None`` or
            a float (such as the NaN pandas produces for a missing value) is
            treated as "no labels".

    Returns:
        ``"adult"`` if that tag is present, otherwise ``"juvenile"`` if that
        tag is present, otherwise ``np.nan``. ``"adult"`` takes precedence
        when both tags occur.
    """
    # A missing value is not iterable, so `"adult" in labels` would raise
    # TypeError on it; report the age as unknown instead.
    if labels is None or isinstance(labels, float):
        return np.nan

    # Order matters: it encodes the "adult before juvenile" precedence.
    for age in ("adult", "juvenile"):
        if age in labels:
            return age
    return np.nan


# Derive one age value per row from that row's tag list.
svc_df["age"] = type_col.map(filter_age)

In [113]:
keep_cols = [
    "id",
    "gen",
    "sp",
    "ssp",
    "en",
    "lat",
    "lng",
    "time",
    "date",
    "gender",
    "age",
]
X_df, y_df = (
    svc_df.reindex(columns=keep_cols).copy(),
    svc_df.reindex(columns=["id", "pred"]).copy(),
)
feature_mapper = DataFrameMapper(
    [
        ("id", None),
        (["gen"], OneHotEncoder()),
        (["sp"], OneHotEncoder()),
        (["ssp"], OneHotEncoder()),
        (["en"], OneHotEncoder()),
        (["lat"], [MinMaxScaler(), SimpleImputer()]),
        (["lng"], [MinMaxScaler(), SimpleImputer()]),
        (["gender"], OneHotEncoder()),
        (["age"], OneHotEncoder()),
    ],
    df_out=True,
)

In [114]:
X_feat_df = feature_mapper.fit_transform(X_df)
X_train, X_test = (
    X_feat_df[X_feat_df.id.isin(train_ids)].drop(columns=["id"]),
    X_feat_df[X_feat_df.id.isin(test_ids)].drop(columns=["id"]),
)
y_train, y_test = (
    y_df[y_df.id.isin(train_ids)].drop(columns=["id"]).squeeze(),
    y_df[y_df.id.isin(test_ids)].drop(columns=["id"]).squeeze(),
)

In [116]:
# Logistic regression with scikit-learn's default settings, trained on the
# training partition. The bare `fit` call is left as the cell's last
# expression so the fitted estimator is displayed.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

LogisticRegression()

In [117]:
# Mean accuracy of the fitted classifier on the test partition.
test_accuracy = lr.score(X_test, y_test)
print(test_accuracy)

0.7240377632534495
