# Petition Labeling

In [1]:
# imports
import numpy as np
import pandas as pd

fname = "./Dataset_recommendations_takehome_tags.csv"
df = pd.read_csv(fname, parse_dates=True, infer_datetime_format=True)
df.sort_values(by='petition_created_at', inplace=True)
df.head()

Unnamed: 0,petition_created_at,petition_id,title,description,total_signature_count,list_of_tags,label
8887,2020-01-01T00:05:14,19876663,"""The City of Apple Valley Parks and Recreation...","""<p>The South of the River Inclusive Playgroun...",2572,"""local""",1
4951,2020-01-01T00:54:12,19876713,"""Rainforest trust: Saving our home the Rainfor...","""<p>By saving the rainforest you save your fri...",567,"""Environment""",0
168,2020-01-01T01:01:23,19876723,"""Her Excellency Queen Elizabeth II and, Govern...","""<p><br>Your Excellency, Our Queen,</p><p>We, ...",3243,"""politics""",1
11205,2020-01-01T02:27:55,19878809,"""American Psycological Association: Narcissist...","""<p>I am petitioning to make Narcissistic Abus...",642,"""Criminal Justice""",0
3859,2020-01-01T03:41:33,19880880,"""Tim Walz: Minnesota's State Board of Investme...","""<p>New York City is one of the only cities to...",1866,"""Environment""",1


## Some statistics:

Abour 63% passed the quality threshold.

In [3]:
df.describe()

Unnamed: 0,petition_id,total_signature_count,label
count,12882.0,12882.0,12882.0
mean,21006680.0,6959.238,0.628474
std,562851.8,49956.75,0.483231
min,19876660.0,500.0,0.0
25%,20675060.0,774.0,0.0
50%,20982500.0,1341.5,1.0
75%,21424450.0,3018.25,1.0
max,22125550.0,3188536.0,1.0


## Isolating tags

The `count_tags` function extracts all tages in the given data set and print the total nimber of tags

In [5]:
def count_tags(df, exclude = set()):
    tags = set()
    all_tags = list(df['list_of_tags'])
    tag_usage_counter = dict()
    cnt = 0
    for tgs in all_tags:
        tgs = tgs.lower()
        tgs = tgs.replace('"', '')
        list_tgs = set(tgs.split(', '))
        for tg in list_tgs:
            if tg in exclude:
                continue
            if tg in tag_usage_counter:
                tag_usage_counter[tg] += 1
            else:
                tag_usage_counter[tg] = 1
        tags = tags.union(list_tgs)
        cnt += 1
    print("Total number of unique tags: %d"%(len(tags)))
    return tags, tag_usage_counter
count_tags(df);

Total number of unique tags: 3660


## Petitions labeled 1

We select all those petitions that passed the quality threshold and extracts all of their tags 

In [6]:
high_quality_df = df[df['label']==1]
_, h_tag_usage_counter = count_tags(high_quality_df)
h_tag_pairs = [(k, h_tag_usage_counter[k]) for k in h_tag_usage_counter]
h_tag_pairs.sort(key=lambda x:-x[1])

Total number of unique tags: 2698


Then, we select the top 100 tags with highest frequency:

In [7]:
h_tags = set([_[0] for _ in h_tag_pairs[:100]])

## Petitions labeled 0

We select all those petitions that **did not** passed the quality threshold and extracts all of their tags 

In [8]:
low_quality_df = df[df['label']==0]
_, l_tag_usage_counter = count_tags(low_quality_df, exclude=h_tags)
l_tag_pairs = [(k, l_tag_usage_counter[k]) for k in l_tag_usage_counter]
l_tag_pairs.sort(key=lambda x:-x[1])

Total number of unique tags: 1584


Then, we select the top 100 tags with highest frequency:

In [9]:
l_tags = set([_[0] for _ in l_tag_pairs[:100]])

## Turning top tags into features:

Now, for each petition and for each high and low quality tags selected above, we add a *binary* feature whose value represents presence or absence of the tag in the given petition:

In [10]:
def include_tag(x, tg):
    tgs = x['list_of_tags'].replace('"', '')
    s_tgs = set(tgs.split(', '))
    if tg in s_tgs:
        return 1.
    return 0.
for tag in h_tags:
    df[tag] = df.apply(lambda x, tg=tag: include_tag(x, tg), axis=1)
for tag in l_tags:
    df[tag] = df.apply(lambda x, tg=tag: include_tag(x, tg), axis=1)

df.head()

Unnamed: 0,petition_created_at,petition_id,title,description,total_signature_count,list_of_tags,label,covid-19 workers' rights,progressive values,workers rights,...,parks,nursing,plastic waste,police brutality,civil rights,graduation 2020,prison reform,food security,travel,evictions
8887,2020-01-01T00:05:14,19876663,"""The City of Apple Valley Parks and Recreation...","""<p>The South of the River Inclusive Playgroun...",2572,"""local""",1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4951,2020-01-01T00:54:12,19876713,"""Rainforest trust: Saving our home the Rainfor...","""<p>By saving the rainforest you save your fri...",567,"""Environment""",0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
168,2020-01-01T01:01:23,19876723,"""Her Excellency Queen Elizabeth II and, Govern...","""<p><br>Your Excellency, Our Queen,</p><p>We, ...",3243,"""politics""",1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11205,2020-01-01T02:27:55,19878809,"""American Psycological Association: Narcissist...","""<p>I am petitioning to make Narcissistic Abus...",642,"""Criminal Justice""",0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3859,2020-01-01T03:41:33,19880880,"""Tim Walz: Minnesota's State Board of Investme...","""<p>New York City is one of the only cities to...",1866,"""Environment""",1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# The prepared data:

Then we use `label` as a target for the classification and top high/low quality tags plus the nu,ber of signatures as features to feed some machine learning classifiers.

In [11]:
features = ['total_signature_count'] + list(h_tags) + list(l_tags)
X = df[features].values
y = df['label'].values

# Gaussina Naive Bayes Classifier:

In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, auc
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)
y_pred = clf_nb.predict(X_test)
y_prob = clf_nb.predict_proba(X_test)
print("Accuracy for Naive Bayes: %.3f"% accuracy_score(y_test, y_pred))
print("F1 for Naive Bayes: %.3f"% f1_score(y_test, y_pred))

Accuracy for Naive Bayes: 0.926
F1 for Naive Bayes: 0.938


# Logistic Regression Classifier:

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, auc
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
clf_log = LogisticRegression()
clf_log.fit(X_train, y_train)
y_pred = clf_log.predict(X_test)
y_prob = clf_nb.predict_proba(X_test)
print("Accuracy for Logistic Regression: %.3f"% accuracy_score(y_test, y_pred))
print("F1 for Logistic Regression: %.3f"% f1_score(y_test, y_pred))
##
#preds = y_prob[:,1]
#fpr, tpr, threshold = roc_curve(y_test, preds)
#roc_auc = auc(fpr, tpr)

# method I: plt
#import matplotlib.pyplot as plt
#plt.title('Receiver Operating Characteristic')
#plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
#plt.legend(loc = 'lower right')
#plt.plot([0, 1], [0, 1],'r--')
#plt.xlim([0, 1])
#plt.ylim([0, 1])
#plt.ylabel('True Positive Rate')
#plt.xlabel('False Positive Rate')

Accuracy for Logistic Regression: 1.000
F1 for Logistic Regression: 1.000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


The ~100% percent accuracy seems a bit surperising and requires more investigation to make sure that we are not overfitting data or the splitting the training/test data in a biased manner.

# Answeres to some of the question:

 **Q**: What kind of model would you use? What is your process for selection?
 
 > **A**: *Always* begin with simple models to get better insight about the complexity of the model.
 We tried Naive Bayes and Logistic Regression to classify the petitions based on their tags and number of signatures. We can explain the outcome of these two models statisticall and derive understandable decision.
 
 **Q**: What is your process for ensuring data quality?
 
> **A**: In this time constrained exercise, I tried to avoid basic mistakes like ignoring lowe case/upper case similarities. But generally one can perform a text classification to group tags in a more scientific manner.

# For more details on the solution and possible improvement see the enclosed MS Word document.