Working notes for now.
Need to build a pipeline for the incoming user inputs that:
1) turns the title into a properly formatted title
2) adds a date (age) column that presumes an age of 0 (or 5, or whatever I decide)--this could also be written to output predictions for various ages
3) adds any other features that are necessary--sentiment analysis, other NLP features, etc.

In [1]:
import pandas as pd
import re
from titlecase import titlecase
from textblob import TextBlob
from textstat.textstat import textstat
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Read the training CSV used for modeling, loading only the title and age columns.
data_1500_kf = pd.read_csv("full_data/english_books_final_deduped_2_to_1500.csv", usecols=["proper_title", "age_in_2017"])
# Drop, in place, every row that has a missing value in either loaded column.
data_1500_kf.dropna(inplace=True)

In [3]:
# Renumber the rows 0..n-1 so the index is contiguous again after dropna removed rows.
data_1500_kf.reset_index(drop=True, inplace=True)

In [4]:
# Sanity check: (rows, columns) left in the training frame after dropna.
data_1500_kf.shape

(3831, 2)

In [5]:
# Sample user input: one (title, age_in_2017) tuple, with the age set to 0.
x = [("Marking Identity: Maori Tattoos and Cultural History", 0)]

In [6]:
# Wrap the sample input in a one-row DataFrame; every feature below is added to it as a column.
df_x = pd.DataFrame(x, columns=["title", "age_in_2017"])
# Display the frame.
df_x

Unnamed: 0,title,age_in_2017
0,Marking Identity: Maori Tattoos and Cultural H...,0


In [7]:
#turn title into proper title
def make_proper_title(string):
    """Return ``string`` normalised into a title-cased title.

    A colon written with a space on both sides (" : ") is tightened to ": ",
    all trailing periods are stripped, and the result is passed through
    ``titlecase``.
    """
    return titlecase(string.replace(" : ", ": ").rstrip("."))

In [8]:
# Normalise the raw input title; all later text features are computed from "proper_title".
df_x["proper_title"] = df_x["title"].apply(make_proper_title)

In [9]:
#add sentiment analysis
def text_blob_sentiment_polarity(value):
    """Return the TextBlob sentiment polarity score of the text ``value``."""
    return TextBlob(value).sentiment.polarity
def text_blob_sentiment_subjectivity(value):
    """Return the TextBlob sentiment subjectivity score of the text ``value``."""
    return TextBlob(value).sentiment.subjectivity

In [10]:
# Sentiment polarity of the normalised title, computed with TextBlob.
df_x["sentiment_polarity"] = df_x["proper_title"].apply(text_blob_sentiment_polarity)


In [11]:
# Sentiment subjectivity of the normalised title, computed with TextBlob.
df_x["sentiment_subjectivity"] = df_x["proper_title"].apply(text_blob_sentiment_subjectivity)


In [12]:
#add reading level
def reading_level_comp(string):
    """Return the textstat reading-level label for ``string``.

    The label is whatever ``textstat.text_standard`` returns. If that call
    raises any ordinary exception, the sentinel ``"Unclear"`` is returned
    instead, so one unscoreable title does not abort a whole ``apply`` pass.
    """
    try:
        return textstat.text_standard(string)
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit still propagate while the best-effort fallback is kept.
    except Exception:
        return "Unclear"

In [13]:
# Reading-level label of the normalised title ("Unclear" when textstat fails on it).
df_x["reading_level"] = df_x["proper_title"].apply(reading_level_comp)

In [14]:
# The input's reading level has to be dummy-encoded against the categories seen in training,
# so first compute the reading level of every training title.
# NOTE: recomputing this over the whole training set on every run is NOT the way to do
# this--it takes too long.
data_1500_kf["reading_level"] = data_1500_kf["proper_title"].apply(reading_level_comp)

In [15]:
# Fit a LabelBinarizer on the training reading levels so it learns the set of classes.
lb_rl = LabelBinarizer()
# The returned training matrix is only displayed, not stored; the fitted lb_rl is what gets reused.
lb_rl.fit_transform(data_1500_kf["reading_level"])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
# Encode the input title's reading level with the binarizer fitted on the training set,
# so its columns line up with the training dummies.
lb_rl_input = lb_rl.transform(df_x["reading_level"])
# Display the encoded row.
lb_rl_input

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]])

In [17]:
# Wrap the encoded row in a DataFrame for joining; no column names are given,
# so the columns get default integer labels starting at 0.
reading_level_dummies = pd.DataFrame(lb_rl_input)
# Sanity check on the dummy frame's shape.
reading_level_dummies.shape

(1, 49)

In [18]:
# Add a number-of-words column: count of whitespace-separated tokens in the normalised title.
df_x["number_of_words"] = df_x["proper_title"].apply(lambda x: len(x.split()))

In [19]:
# Add the title length: number of characters in the normalised title.
df_x["title_length"] = df_x["proper_title"].apply(len)

In [20]:
# Topic modeling, step 1: build the bag-of-words model on the training set.
# Terms are 1- to 3-grams with English stop words removed; a term is kept only if it
# appears in at least 5 titles and in at most 75% of them.
# NOTE: this fitted vectorizer can be pickled for use in both the input pipeline and the model.
cv_for_lda = CountVectorizer(min_df=5, max_df=.75, ngram_range=(1,3), stop_words="english")

# Learn the vocabulary from the training titles and get their term-count matrix.
words = cv_for_lda.fit_transform(data_1500_kf["proper_title"])

In [21]:
# Topic modeling, step 2: fit an 8-topic LDA model on the training term counts.
# NOTE: as above, the fitted model can be pickled and reused.
# NOTE(review): newer scikit-learn releases name this parameter n_components instead of
# n_topics -- confirm against the installed version before re-running.
lda_8 = LatentDirichletAllocation(n_topics=8, max_iter=15,
                                topic_word_prior=2,
                                learning_offset=50., random_state=3)

lda_8.fit(words)




LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=50.0, max_doc_update_iter=100, max_iter=15,
             mean_change_tol=0.001, n_jobs=1, n_topics=8, perp_tol=0.1,
             random_state=3, topic_word_prior=2, total_samples=1000000.0,
             verbose=0)

In [22]:
# Count-vectorize the input title with the vectorizer fitted on the training set, so the
# columns match the vocabulary the LDA model was fitted on.
input_words = cv_for_lda.transform(df_x["proper_title"])


In [23]:
# Sanity check: one row for the single input title, one column per vocabulary term.
input_words.shape

(1, 772)

In [24]:
# Get the input title's topic weights from the LDA model fitted on the training set.
transformed_data_8= lda_8.transform(input_words)
# Label the 8 weight columns "Topic 0" .. "Topic 7"; top_topic_number_extractor reads the
# topic number back out of these names.
transformed_data_8 = pd.DataFrame(transformed_data_8, columns=['Topic %s' % x for x in range(8)])

In [25]:
# Inspect the per-topic weights assigned to the input title.
transformed_data_8

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7
0,0.015676,0.015654,0.015647,0.015646,0.489557,0.236054,0.196127,0.01564


In [26]:
def top_topic_number_extractor(dataframe):
    """Return the dominant topic number for each row of a topic-weight frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per document and one column per topic. Column names must be
        strings that contain the topic number, e.g. ``"Topic 3"``.

    Returns
    -------
    list of int
        For every row, in row order, the first run of digits found in the
        name of the column holding that row's largest value. When several
        columns tie for the maximum, the left-most one wins.

    Raises
    ------
    AttributeError
        If the winning column's name contains no digits.
    """
    # Compile once, outside the loop; the raw string keeps "\d" from being
    # read as a (deprecated) string escape.
    digit_run = re.compile(r"\d+")
    top_topic_list = []
    # iterrows walks the rows by position, so this works for any index
    # (the previous version fed index labels into .iloc, which is positional).
    for _, topic_weights in dataframe.iterrows():
        top_topic_name = topic_weights.idxmax()
        top_topic_number = digit_run.search(top_topic_name).group()
        top_topic_list.append(int(top_topic_number))
    return top_topic_list

In [27]:
# Store the number of the highest-weighted LDA topic for the input title.
df_x["top_topic_number_lda8"] = top_topic_number_extractor(transformed_data_8)

In [28]:
# Confirm the winning topic number recorded for the input title.
df_x["top_topic_number_lda8"]

0    4
Name: top_topic_number_lda8, dtype: int64

In [29]:
# One-hot encode the winning topic of the first (only) input row across the 8 LDA topics:
# 1 at the position of the winning topic number, 0 everywhere else.
topics_list = [
    1 if df_x["top_topic_number_lda8"][0] == topic_number else 0
    for topic_number in range(0, 8)
]

In [30]:
# Inspect the one-hot topic vector.
topics_list

[0, 0, 0, 0, 1, 0, 0, 0]

In [31]:
# Turn the one-hot topics list into a one-row DataFrame for joining: the list becomes a
# single column, which transpose() flips into a single row with one column per topic.
top_topics_df = pd.DataFrame(topics_list).transpose().copy()

In [32]:
# Inspect the one-row topic-dummies frame before joining.
top_topics_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,0,0,0,1,0,0,0


In [33]:
# TF-IDF vectorize the training titles: case-sensitive (lowercase=False) 1- to 3-grams with
# English stop words removed, keeping terms found in at least 5 and at most 95% of titles.
tfidf = TfidfVectorizer(min_df=5, max_df=.95, lowercase=False, stop_words="english", ngram_range=(1,3))
# The returned training matrix is only displayed; the fitted vectorizer is reused below.
tfidf.fit_transform(data_1500_kf["proper_title"])

<3831x923 sparse matrix of type '<class 'numpy.float64'>'
	with 13247 stored elements in Compressed Sparse Row format>

In [34]:
# Number of terms kept in the fitted TF-IDF vocabulary.
len(tfidf.vocabulary_)

923

In [35]:
# Transform the input title with the TF-IDF vectorizer fitted on the training set.
tfidf_title = tfidf.transform(df_x["proper_title"])

In [36]:
# Densify the sparse TF-IDF row and wrap it in a DataFrame, one column per vocabulary term,
# so it can be joined onto the other input features.
# NOTE(review): newer scikit-learn releases expose get_feature_names_out() in place of
# get_feature_names() -- confirm against the installed version before re-running.
tfidf_title_df = pd.DataFrame(tfidf_title.todense(), 
                  columns=tfidf.get_feature_names())

In [37]:
# Sanity check: one row, one column per TF-IDF vocabulary term.
tfidf_title_df.shape

(1, 923)

In [38]:
# Assemble the feature row X to match the model's X (named X2 to match the final model).
# Column order: numeric features, topic dummies, TF-IDF terms, reading-level dummies.
# NOTE(review): top_topics_df and reading_level_dummies both carry default integer column
# labels starting at 0, so X2 ends up with duplicate labels -- presumably the model relies
# on column position rather than name; confirm before selecting columns by label.
X2 = pd.concat([df_x[["age_in_2017", "sentiment_polarity", "sentiment_subjectivity", "number_of_words", "title_length"]], top_topics_df, tfidf_title_df, reading_level_dummies], axis=1)


In [39]:
# Inspect the assembled feature row.
X2

Unnamed: 0,age_in_2017,sentiment_polarity,sentiment_subjectivity,number_of_words,title_length,0,1,2,3,4,...,39,40,41,42,43,44,45,46,47,48
0,0,0.1,0.1,7,52,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Sanity check on the total number of feature columns.
X2.shape

(1, 985)

In [41]:
# Save the assembled feature row as UTF-8 CSV without the index column.
X2.to_csv("test_data/test titles/X2_Marking_Identityc_Maori_Tattoos_and_Cultural_History.csv", index=False, encoding="utf-8")