In [None]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import json
%matplotlib inline

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 300

## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender and was made available [here](https://www.kaggle.com/c/stumbleupon/download/train.tsv)

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonLinkRatio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonLinkRatio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonLinkRatio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonLinkRatio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer Number of <a>|markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if it's url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

### What are 'evergreen' sites?
- These are websites that always relevant like recipies or reviews (as opposed to current events)
- Look at some examples

In [None]:
df_data = pd.read_csv('../../assets/datasets/train.csv', sep='\t', na_values='?')

df_data.head()


In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# Fill NAs
df_data['alchemy_category'].fillna('Unknown', inplace=True)
df_data['alchemy_category_score'].fillna(0.40, inplace=True)
df_data[df_data.alchemy_category == 'Unknown']['alchemy_category_score'].value_counts()

# Feature Selection
X = df_data[[x for x in df_data.columns if x not in ['label','url','is_news','news_front_page','url_id']]]
X = pd.get_dummies(X)
y = df_data['label']


X.info()

In [53]:
from sklearn import preprocessing

# Normalizing data and getting coefeccients
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)
model = LogisticRegression()
model.fit(X_norm, y)
coeffs = pd.DataFrame(model.coef_, columns = [X.columns.values])
coeffs = coeffs.transpose()
           
coeffs.head()


Unnamed: 0,0
urlid,-0.15605
alchemy_category_score,-0.064963
avglinksize,0.010578
commonlinkratio_1,0.137689
commonlinkratio_2,0.117916


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=10)
selector.fit(X_norm, y)

keep = pd.DataFrame(selector.get_support())
keeper = keep.transpose()
keeper.columns = [x for x in X.columns.values]
keep = keeper.transpose()

print 'Based on K best here is what I would keep...'
keep_2 = keep[keep[0] == True]

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=10, step=1)
rfe.fit(X_norm, y)

rfe_values = rfe.support_
rfe_values = X_norm.columns[rfe_values].values

print "Final Features to Keep According to RFE"
print rfe_values

In [None]:
# Feature engineering to construct new features
from sklearn.feature_extraction.text import CountVectorizer

vector_1 = CountVectorizer(
    binary=True,  
    stop_words='english', 
    max_features=50, 
)

X_1 = vector_1.fit_transform(df_data.boilerplate).todense()
df_X_1 = pd.DataFrame(X_1, columns=vector_1.get_feature_names()).head()

In [None]:
# Train a logit model

def model(features):
    X = df_data[features]
    y = df_data['label']

    logreg = LogisticRegression()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    logreg.fit(X_train, y_train)
    
    predictions = logreg.predict(X_test)
    
    confustion_matrix = confusion_matrix(y_test, predictions)
   
    df_final = pd.DataFrame(confusion_matrix, index=['Green','Not Green'], columns=['Green Predicted', 'Green NOT Predicted'])
                                       
    print df_final
    print classification_report(y_test, predictions)


print "K_Best: ",
modeler(keep_2)

print "RFE: ",
modeler(rfe_values)