In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib

from collections import Counter

In [None]:
# Notebook-wide plot styling: white grid background, seaborn's "muted"
# colour cycle, and a 16pt base font for every matplotlib figure.
sns.set_style('whitegrid')
sns.set_palette('muted')
matplotlib.rcParams['font.size'] = 16

# Tasks
- Visualize the length of time ads ran for
    - Use to normalize impressions and clicks
- Split out gender
- Clustering (kmeans for 2-6)
    - tf-idf vectors
    - project into 2D with t-SNE
    - make colorful pics

### Load the data

In [None]:
# Directory (with trailing slash) and file name of the ads CSV; the two are
# concatenated as-is to form the path that is read.
PATH = 'data/'
fname = 'all_ads.csv'
df = pd.read_csv(PATH + fname, encoding='utf8')

In [None]:
# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Write the frame back to the same path it was loaded from, without the index.
# NOTE(review): this overwrites the input file on every run -- confirm that is
# intended, or write to a separate output file instead.
df.to_csv(PATH + fname, index=False, encoding='utf8')

# Basic Visualization

#### Look at the distribution of ages targeted

In [None]:
def count_age_targets(row):
    """Return every integer age targeted by one ad, bounds inclusive.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'min_age'`` and ``'max_age'`` (for example a
        row yielded by ``DataFrame.iterrows()``).

    Returns
    -------
    list of int
        ``[min_age, min_age + 1, ..., max_age]``; empty when
        ``max_age < min_age``.

    Raises
    ------
    ValueError
        If either bound is NaN and therefore cannot be converted to ``int``.
    """
    # iterrows() builds one Series per row, which can upcast integer columns
    # to float; range() rejects floats, so coerce the bounds back to int.
    first_age = int(row['min_age'])
    last_age = int(row['max_age'])
    return list(range(first_age, last_age + 1))

# Flatten the per-ad age ranges into one long list: an age appears once for
# every ad whose [min_age, max_age] range contains it.
all_age_targets = [
    age
    for _, ad_row in df.iterrows()
    for age in count_age_targets(ad_row)
]

In [None]:
# One bin per integer age, centred on that age, so every bar counts exactly
# one age (a fixed bin count would not line up with the integer values).
age_bins = np.arange(min(all_age_targets), max(all_age_targets) + 2) - 0.5
plt.hist(all_age_targets, bins=age_bins)

# Each ad contributes at most one count per age, so relabel the count axis
# as a fraction of all ads.
fractions = np.arange(0, 1.1, 0.25)
plt.yticks(fractions * len(df), fractions)
plt.xlim(8, 68)
plt.xlabel('Age')
plt.ylabel('Fraction of ads\ntargeting that age')
plt.title('Ages Targeted')
plt.show()

#### Cost-effectiveness

In [None]:
# Spend against impressions on log-log axes; the low alpha keeps dense
# regions of overlapping points readable.
ax = df.plot.scatter(x='ad_spend_rub', y='ad_impressions', logx=True, logy=True, alpha=0.2)
ax.set_xlabel('Spend (RUB)')
ax.set_ylabel('Impressions')
plt.show()

In [None]:
# Spend against clicks on log-log axes; the low alpha keeps dense regions of
# overlapping points readable.
ax = df.plot.scatter(x='ad_spend_rub', y='ad_clicks', logx=True, logy=True, alpha=0.2)
ax.set_xlabel('Spend (RUB)')
ax.set_ylabel('Ad Clicks')
plt.show()

In [None]:
# Impressions against clicks on log-log axes; the low alpha keeps dense
# regions of overlapping points readable.
ax = df.plot.scatter(x='ad_impressions', y='ad_clicks', logx=True, logy=True, alpha=0.2)
ax.set_xlabel('Impressions')
ax.set_ylabel('Ad Clicks')
plt.show()

#### Rough time-series

In [None]:
# The x-axis is whatever the first column of the frame holds.
# NOTE(review): presumably a date/time column -- confirm against the CSV.
time_series = df.iloc[:, 0]
y = df['ad_clicks']

plt.scatter(time_series, y)
# Label the axes so the figure can be read on its own; the x label is taken
# from the column name because the column is selected by position.
plt.xlabel(str(time_series.name))
plt.ylabel('Ad Clicks')
plt.title('Ad clicks by first column')
plt.show()

## Normalizing impressions/clicks by ad spend

In [None]:
import warnings

# Escalate only RuntimeWarning (the category numpy uses for log(0) and
# log(negative)) to an exception. A blanket 'error' filter would also turn
# every unrelated DeprecationWarning/FutureWarning raised by any library in
# later cells into a crash.
warnings.filterwarnings('error', category=RuntimeWarning)

In [None]:
def safe_log(v):
    """Natural log of ``v`` that falls back to 0 where the log is undefined.

    Parameters
    ----------
    v : number
        Value to take the logarithm of.

    Returns
    -------
    float or int
        ``np.log(v)`` for positive ``v``; ``0`` when ``v`` is zero or
        negative.
    """
    try:
        # Ask numpy to raise locally for log(0) / log(negative) instead of
        # relying on a process-wide warnings filter set in another cell.
        with np.errstate(divide='raise', invalid='raise'):
            return np.log(v)
    except FloatingPointError:
        return 0

In [None]:
def _log_or_zero(values):
    """Vectorised natural log mapping non-positive inputs to 0 (NaN stays NaN)."""
    values = np.asarray(values, dtype=float)
    # Silence the floating-point signals locally; the entries that trigger
    # them (zero / negative inputs) are overwritten on the next line.
    with np.errstate(divide='ignore', invalid='ignore'):
        logged = np.log(values)
        logged[values <= 0] = 0.0
    return logged


def normalize_values(df, val_col, norm_col='ad_spend_rub', plot=False):
    """Remove the log-log linear trend of ``val_col`` against ``norm_col``.

    Both columns are moved to log space (non-positive values are treated as
    log = 0), a straight line is fitted to ``log(val_col)`` as a function of
    ``log(norm_col)``, and the residuals from that line are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    val_col : str
        Column to normalize.
    norm_col : str, default 'ad_spend_rub'
        Column to normalize by.
    plot : bool, default False
        When true, show the fit and the residuals as two scatter plots.

    Returns
    -------
    numpy.ndarray
        ``log(val_col) - fitted_line(log(norm_col))``, one entry per row.
        Rows whose input is NaN stay NaN in the output.

    Raises
    ------
    ValueError
        If fewer than two rows have finite values in both columns.
    """
    # Convert to log space
    y_log = _log_or_zero(df[val_col].values)
    x_log = _log_or_zero(df[norm_col].values)

    # Fit only on rows where both values are finite so a single NaN cannot
    # break the least-squares solve.
    usable = np.isfinite(x_log) & np.isfinite(y_log)
    if usable.sum() < 2:
        raise ValueError(
            f"need at least two finite rows in {val_col!r} and {norm_col!r} to fit a line"
        )

    # Do a linear fit
    fit_fn = np.poly1d(np.polyfit(x_log[usable], y_log[usable], 1))
    y_fit_log = fit_fn(x_log)

    # Get normalized results
    y_normalize_log = y_log - y_fit_log

    if plot:
        # The fit is a straight line, so its two end points are enough.
        x_span = np.array([x_log[usable].min(), x_log[usable].max()])

        # Plot true data
        plt.scatter(x_log, y_log, alpha=0.2)
        plt.plot(x_span, fit_fn(x_span), color='C1')
        plt.title("Original Data")
        plt.xlabel(f"log({norm_col})")
        plt.ylabel(f"log({val_col})")
        plt.show()

        # Plot normalized data
        plt.scatter(x_log, y_normalize_log, alpha=0.2)
        plt.axhline(0, color='C1')
        plt.title("Normalized Data")
        plt.xlabel(f"log({norm_col})")
        plt.ylabel(f"{val_col}_normalized")
        plt.show()

    return y_normalize_log

In [None]:
# Spend-normalized clicks and impressions, with the diagnostic plots shown.
ad_clicks_tgt = normalize_values(df, val_col='ad_clicks', norm_col='ad_spend_rub', plot=True)
ad_impressions_tgt = normalize_values(df, val_col='ad_impressions', norm_col='ad_spend_rub', plot=True)

# Predicting ad clicks

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [None]:
# Vectorize the text
# Raw string so that \s and \w reach the regex engine untouched; in a normal
# string literal they are invalid escape sequences, which Python warns about
# (and rejects outright when warnings are escalated to errors).
pattern = re.compile(r'([^\s\w]|_)+')

# Replace missing ad text with an empty string first; otherwise str(NaN)
# would inject the literal token "nan" into the vocabulary.
cleaned_text = (
    df['ad_text']
    .fillna('')
    .apply(lambda text: pattern.sub('', str(text).lower()))
    .values
)

vectorizer = TfidfVectorizer(max_df=0.5, stop_words='english', max_features=2000)
vecs = vectorizer.fit_transform(cleaned_text)

In [None]:
# Regression setup: TF-IDF vectors are the features, spend-normalized clicks
# are the target.
X, y = vecs, ad_clicks_tgt

# Hold out 20% of the ads for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit a ridge regression with its default settings (fit() returns the
# estimator itself), then predict on both splits.
clf = Ridge().fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [None]:
plt.scatter(y_train, y_train_pred)

# y = x reference line (perfect predictions), matching the test-set plot and
# spanning the range of the data actually plotted.
train_lo = min(np.nanmin(y_train), np.nanmin(y_train_pred))
train_hi = max(np.nanmax(y_train), np.nanmax(y_train_pred))
plt.plot([train_lo, train_hi], [train_lo, train_hi])

plt.xlabel('y_train')
plt.ylabel('y_train_pred')
plt.show()

In [None]:
plt.scatter(y_test, y_test_pred)

# y = x reference line (perfect predictions), spanning the range of the data
# actually plotted rather than a fixed interval.
test_lo = min(np.nanmin(y_test), np.nanmin(y_test_pred))
test_hi = max(np.nanmax(y_test), np.nanmax(y_test_pred))
plt.plot([test_lo, test_hi], [test_lo, test_hi])

plt.xlabel('y_test')
plt.ylabel('y_test_pred')
plt.show()