In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem.porter import *
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the lab's Christmas-songs CSV into the working DataFrame `df`.
DATA_PATH = 'data/Lab7_TextAnalysisChristmasSongsFull.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then Porter-stem the result.

    Parameters
    ----------
    text : str
        One token (word) to normalise.

    Returns
    -------
    str
        The Porter stem of the WordNet verb lemma of ``text``.
    """
    # Lemmatize first (treating the token as a verb), stem second.
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    stemmer = PorterStemmer()
    return stemmer.stem(verb_lemma)


def preprocess(text):
    """Turn raw text into a space-separated string of normalised tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``, tokens
    found in gensim's ``STOPWORDS`` are dropped, and every remaining token
    goes through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The kept tokens, each preceded by one space (so a non-empty result
        starts with a space); ``''`` when no token survives.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    kept_tokens = [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if word not in stop_words
    ]
    # Prefix (rather than separate) with spaces to reproduce the leading blank.
    return ''.join(' ' + word for word in kept_tokens)

# Keep the normalised lyrics next to the raw `text` column.
df['mod_text'] = df['text'].apply(preprocess)

In [4]:
# Module-level VADER analyzer; sentiment_df() reads this name when it is called.
analyzer = SentimentIntensityAnalyzer()

def sentiment_df(text_column):
    """Score every text with the module-level ``analyzer``.

    Parameters
    ----------
    text_column : iterable of str
        Typically a pandas Series of documents. Rows are read by position,
        so the Series may carry any index (e.g. a shuffled train split).

    Returns
    -------
    list of list
        Four lists, in the order ``neg``, ``neu``, ``pos``, ``compound``,
        each holding one score per input text in input order.
    """
    # Iterate over the values themselves: the previous `x[i]` lookup was
    # label-based and failed (or hit the wrong row) unless the index was 0..n-1.
    scores = [analyzer.polarity_scores(text) for text in text_column]
    return [
        [score[key] for score in scores]
        for key in ['neg', 'neu', 'pos', 'compound']
    ]
# Score the raw lyrics and attach one column per sentiment component.
neg_scores, neu_scores, pos_scores, compound_scores = sentiment_df(df['text'])
df['neg'] = neg_scores
df['neu'] = neu_scores
df['pos'] = pos_scores
df['compound'] = compound_scores

In [7]:
# Preview the first five rows, including the derived text and sentiment columns.
df.head(n=5)

Unnamed: 0,artist,song,text,songID,Popular,mod_text,neg,neu,pos,compound
0,ABBA,We Wish You A Merry Christmas,We wish you a merry Christmas \r\r\nWe wish y...,1,0,wish merri christma wish merri christma wish ...,0.0,0.582,0.418,0.9979
1,Air Supply,Sleigh Ride,Just hear those sleigh bells jingling ring tin...,2,1,hear sleigh bell jingl ring ting tingl come l...,0.036,0.751,0.213,0.9956
2,Alabama,Christmas In Dixie,"By now in New York City, there's snow on the g...",3,0,new york citi snow grind california sunshin f...,0.013,0.801,0.186,0.969
3,Alabama,Christmas In Your Arms,All my friends are asking me where I plan to s...,4,0,friend ask plan spend holiday peopl celebr se...,0.081,0.719,0.2,0.9758
4,Alabama,Christmas Is Love,It's that time of year when the whole world is...,5,0,time year world heart tot heart feel love chr...,0.0,0.559,0.441,0.9986


In [28]:
# Hold out 25% of the songs; the fixed seed keeps the split reproducible.
feature_columns = ['mod_text', 'neg', 'neu', 'pos', 'compound']
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df['Popular'],
    test_size=0.25,
    random_state=181,
)

In [41]:
# Number of training rows (features).
X_train.shape[0]

791

In [42]:
# Number of training labels; should match the feature row count above.
y_train.shape[0]

791

1.1 Logistic regression on TF-IDF features of the preprocessed lyrics

In [None]:
# Pipeline: TF-IDF features of the preprocessed lyrics -> logistic regression.
tfidf = TfidfVectorizer()
log = LogisticRegression(max_iter=1000, solver='lbfgs')
pipe = Pipeline(steps=[('tfidf', tfidf), ('log', log)])

# Search over unigram vs. unigram+bigram features and the regularisation
# parameter C on a log scale.
params = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'log__C': np.logspace(-3, 5, 1000),
}

cv = GridSearchCV(estimator=pipe, param_grid=params, cv=5, verbose=1, n_jobs=-1)
cv.fit(X_train['mod_text'], y_train)

1.2 Logistic regression on LDA topic features (TF-IDF → LDA)

In [None]:
# Pipeline: TF-IDF -> LDA topic proportions -> logistic regression.
tfidf = TfidfVectorizer()
# Seed LDA (same seed as the train/test split) so that repeated runs of the
# grid search give the same scores; LDA's fit is otherwise randomly initialised.
lda = LatentDirichletAllocation(random_state=181)
log = LogisticRegression(solver='lbfgs', max_iter=1000)

# Search over the n-gram range, the number of topics (5..29) and the
# regularisation parameter C on a log scale.
params = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'lda__n_components': range(5, 30),
    'log__C': np.logspace(-3, 5, 1000),
}
pipe = Pipeline(steps=[('tfidf', tfidf), ('lda', lda), ('log', log)])
cv2 = GridSearchCV(pipe, params, cv=5, verbose=1, n_jobs=-1)
cv2.fit(X_train['mod_text'], y_train)

1.3 Logistic regression on the VADER sentiment scores

In [None]:
# Logistic regression on the four sentiment scores only; C is the single
# tuned hyper-parameter.
sentiment_columns = ['neg', 'neu', 'pos', 'compound']
log = LogisticRegression(max_iter=1000, solver='lbfgs')
pipe = Pipeline(steps=[('log', log)])
params = {
    'log__C': np.logspace(-3, 5, 1000),
}
cv3 = GridSearchCV(estimator=pipe, param_grid=params, cv=5, verbose=1, n_jobs=-1)
cv3.fit(X_train[sentiment_columns], y_train)

2.1 TF-IDF + LDA topic features (manual grid search with 5-fold CV)

In [None]:
# Manual grid search: logistic regression on TF-IDF features concatenated
# with LDA topic proportions, scored by 5-fold CV on the training split.
gridsearch_params = [
    (C, ngram_range, min_df, max_df, n_components)
    for C in np.logspace(-3, 5, 1000)
    for ngram_range in [(1, 1), (1, 2), (1, 3)]
    for min_df in [0.1, 0.2]
    for max_df in [0.8, 0.9, 1.0]
    for n_components in range(5, 30)
]

kf = KFold(n_splits=5)

# Holds the combination currently being evaluated (mutated every iteration).
params = {
    'C': None,
    'ngram_range': None,
    'min_df': None,
    'max_df': None,
    'n_components': None
}

best_cv_score_21 = 0
best_cv_score_std_21 = None
best_model_21 = None
best_parameters_21 = None
param = 0
for C, ngram_range, min_df, max_df, n_components in gridsearch_params:
    cv_scores = []
    param += 1
    print('param', param)
    fold = 0

    params['C'] = C
    params['ngram_range'] = ngram_range
    params['min_df'] = min_df
    params['max_df'] = max_df
    params['n_components'] = n_components

    for train_index, test_index in kf.split(X_train):
        fold += 1
        print('fold', fold)

        # TF-IDF vocabulary is learned on the training fold only.
        tfidf = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, max_df=max_df)
        temp = tfidf.fit_transform(X_train['mod_text'].iloc[train_index])
        # get_feature_names() was removed in scikit-learn 1.2; only fall back
        # to it on releases that predate get_feature_names_out().
        if hasattr(tfidf, 'get_feature_names_out'):
            feature_names = list(tfidf.get_feature_names_out())
        else:
            feature_names = tfidf.get_feature_names()
        X_1_tr = pd.DataFrame(temp.toarray(), columns=feature_names)
        temp = tfidf.transform(X_train['mod_text'].iloc[test_index])
        X_1_te = pd.DataFrame(temp.toarray(), columns=feature_names)

        # Seeded so that scores are comparable across parameter combinations
        # and across notebook runs.
        lda = LatentDirichletAllocation(n_components=n_components, n_jobs=-1, random_state=181)
        temp = lda.fit_transform(X_1_tr)
        feature_names = ['topic {}'.format(i) for i in range(1, n_components + 1)]
        X_2_tr = pd.DataFrame(temp, columns=feature_names)
        temp = lda.transform(X_1_te)
        X_2_te = pd.DataFrame(temp, columns=feature_names)

        X_cv_train = pd.concat([X_1_tr, X_2_tr], axis=1)
        X_cv_test = pd.concat([X_1_te, X_2_te], axis=1)
        # The fold indices are positions within X_train (kf.split(X_train)),
        # so the labels must come from y_train; indexing df['Popular'] with
        # them would pair the features with the labels of different songs.
        y_cv_train = y_train.iloc[train_index]
        y_cv_test = y_train.iloc[test_index]

        # max_iter matches the logistic regressions in sections 1.1-1.3.
        log = LogisticRegression(C=C, solver='lbfgs', max_iter=1000, n_jobs=-1)
        log.fit(X_cv_train, y_cv_train)

        cv_scores.append(log.score(X_cv_test, y_cv_test))

    mean_score = np.mean(cv_scores)
    if mean_score > best_cv_score_21:
        best_cv_score_21 = mean_score
        best_cv_score_std_21 = np.std(cv_scores)
        # Snapshot: `params` is overwritten on the next iteration.
        best_parameters_21 = dict(params)
        # These are the estimators fitted on the last fold of this combination.
        best_model_21 = [tfidf, lda, log]

    print('score', mean_score)
    print('params', params)

2.2 TF-IDF + sentiment features (manual grid search with 5-fold CV)

In [None]:
# Manual grid search: logistic regression on TF-IDF features concatenated
# with the four sentiment scores, scored by 5-fold CV on the training split.
gridsearch_params = [
    (C, ngram_range, min_df, max_df)
    for C in np.logspace(-3, 5, 1000)
    for ngram_range in [(1, 1), (1, 2), (1, 3)]
    for min_df in [0.1, 0.2]
    for max_df in [0.8, 0.9, 1.0]
]

kf = KFold(n_splits=5)

# Holds the combination currently being evaluated (mutated every iteration).
params = {
    'C': None,
    'ngram_range': None,
    'min_df': None,
    'max_df': None,
}

best_cv_score_22 = 0
best_cv_score_std_22 = None
best_model_22 = None
best_parameters_22 = None
param = 0
for C, ngram_range, min_df, max_df in gridsearch_params:
    cv_scores = []
    param += 1
    print('param', param)
    fold = 0

    params['C'] = C
    params['ngram_range'] = ngram_range
    params['min_df'] = min_df
    params['max_df'] = max_df

    for train_index, test_index in kf.split(X_train):
        fold += 1
        print('fold', fold)

        # TF-IDF vocabulary is learned on the training fold only.
        tfidf = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, max_df=max_df)
        temp = tfidf.fit_transform(X_train['mod_text'].iloc[train_index])
        # get_feature_names() was removed in scikit-learn 1.2; only fall back
        # to it on releases that predate get_feature_names_out().
        if hasattr(tfidf, 'get_feature_names_out'):
            feature_names = list(tfidf.get_feature_names_out())
        else:
            feature_names = tfidf.get_feature_names()
        X_1_tr = pd.DataFrame(temp.toarray(), columns=feature_names)
        temp = tfidf.transform(X_train['mod_text'].iloc[test_index])
        X_1_te = pd.DataFrame(temp.toarray(), columns=feature_names)

        # Index is reset so the rows line up with the 0..n-1 index of the
        # TF-IDF frames when concatenated column-wise.
        X_3_tr = X_train[['neg', 'neu', 'pos', 'compound']].iloc[train_index]
        X_3_tr = X_3_tr.reset_index(drop=True)
        X_3_te = X_train[['neg', 'neu', 'pos', 'compound']].iloc[test_index]
        X_3_te = X_3_te.reset_index(drop=True)

        X_cv_train = pd.concat([X_1_tr, X_3_tr], axis=1)
        X_cv_test = pd.concat([X_1_te, X_3_te], axis=1)
        # The fold indices are positions within X_train (kf.split(X_train)),
        # so the labels must come from y_train; indexing df['Popular'] with
        # them would pair the features with the labels of different songs.
        y_cv_train = y_train.iloc[train_index]
        y_cv_test = y_train.iloc[test_index]

        # max_iter matches the logistic regressions in sections 1.1-1.3.
        log = LogisticRegression(C=C, solver='lbfgs', max_iter=1000, n_jobs=-1)
        log.fit(X_cv_train, y_cv_train)

        cv_scores.append(log.score(X_cv_test, y_cv_test))

    mean_score = np.mean(cv_scores)
    if mean_score > best_cv_score_22:
        best_cv_score_22 = mean_score
        best_cv_score_std_22 = np.std(cv_scores)
        # Snapshot: `params` is overwritten on the next iteration.
        best_parameters_22 = dict(params)
        # These are the estimators fitted on the last fold of this combination.
        best_model_22 = [tfidf, log]

    print('score', mean_score)
    print('params', params)

2.3 LDA topic + sentiment features (manual grid search with 5-fold CV)

In [None]:
# Manual grid search: logistic regression on LDA topic proportions
# concatenated with the four sentiment scores, scored by 5-fold CV on the
# training split. TF-IDF is computed only as the input to LDA.
gridsearch_params = [
    (C, ngram_range, min_df, max_df, n_components)
    for C in np.logspace(-3, 5, 1000)
    for ngram_range in [(1, 1), (1, 2), (1, 3)]
    for min_df in [0.1, 0.2]
    for max_df in [0.8, 0.9, 1.0]
    for n_components in range(5, 30)
]

kf = KFold(n_splits=5)

# Holds the combination currently being evaluated (mutated every iteration).
params = {
    'C': None,
    'ngram_range': None,
    'min_df': None,
    'max_df': None,
    'n_components': None
}

best_cv_score_23 = 0
best_cv_score_std_23 = None
best_model_23 = None
best_parameters_23 = None
param = 0
for C, ngram_range, min_df, max_df, n_components in gridsearch_params:
    cv_scores = []
    param += 1
    print('param', param)
    fold = 0

    params['C'] = C
    params['ngram_range'] = ngram_range
    params['min_df'] = min_df
    params['max_df'] = max_df
    params['n_components'] = n_components

    for train_index, test_index in kf.split(X_train):
        fold += 1
        print('fold', fold)

        # TF-IDF vocabulary is learned on the training fold only.
        tfidf = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, max_df=max_df)
        temp = tfidf.fit_transform(X_train['mod_text'].iloc[train_index])
        # get_feature_names() was removed in scikit-learn 1.2; only fall back
        # to it on releases that predate get_feature_names_out().
        if hasattr(tfidf, 'get_feature_names_out'):
            feature_names = list(tfidf.get_feature_names_out())
        else:
            feature_names = tfidf.get_feature_names()
        X_1_tr = pd.DataFrame(temp.toarray(), columns=feature_names)
        temp = tfidf.transform(X_train['mod_text'].iloc[test_index])
        X_1_te = pd.DataFrame(temp.toarray(), columns=feature_names)

        # Seeded so that scores are comparable across parameter combinations
        # and across notebook runs.
        lda = LatentDirichletAllocation(n_components=n_components, n_jobs=-1, random_state=181)
        temp = lda.fit_transform(X_1_tr)
        feature_names = ['topic {}'.format(i) for i in range(1, n_components + 1)]
        X_2_tr = pd.DataFrame(temp, columns=feature_names)
        temp = lda.transform(X_1_te)
        X_2_te = pd.DataFrame(temp, columns=feature_names)

        # Index is reset so the rows line up with the 0..n-1 index of the
        # topic frames when concatenated column-wise.
        X_3_tr = X_train[['neg', 'neu', 'pos', 'compound']].iloc[train_index]
        X_3_tr = X_3_tr.reset_index(drop=True)
        X_3_te = X_train[['neg', 'neu', 'pos', 'compound']].iloc[test_index]
        X_3_te = X_3_te.reset_index(drop=True)

        X_cv_train = pd.concat([X_2_tr, X_3_tr], axis=1)
        X_cv_test = pd.concat([X_2_te, X_3_te], axis=1)
        # The fold indices are positions within X_train (kf.split(X_train)),
        # so the labels must come from y_train; indexing df['Popular'] with
        # them would pair the features with the labels of different songs.
        y_cv_train = y_train.iloc[train_index]
        y_cv_test = y_train.iloc[test_index]

        # max_iter matches the logistic regressions in sections 1.1-1.3.
        log = LogisticRegression(C=C, solver='lbfgs', max_iter=1000, n_jobs=-1)
        log.fit(X_cv_train, y_cv_train)

        cv_scores.append(log.score(X_cv_test, y_cv_test))

    mean_score = np.mean(cv_scores)
    if mean_score > best_cv_score_23:
        best_cv_score_23 = mean_score
        best_cv_score_std_23 = np.std(cv_scores)
        # Snapshot: `params` is overwritten on the next iteration.
        best_parameters_23 = dict(params)
        # These are the estimators fitted on the last fold of this combination.
        best_model_23 = [tfidf, lda, log]

    print('score', mean_score)
    print('params', params)

3.1 TF-IDF + LDA topic + sentiment features, cross-validated on the full dataset

In [None]:
# Manual grid search: logistic regression on TF-IDF features, LDA topic
# proportions and the four sentiment scores together. Unlike the 2.x cells,
# the 5-fold CV here runs over the full `df`, not over X_train.
gridsearch_params = [
    (C, ngram_range, min_df, max_df, n_components)
    for C in np.logspace(-3, 5, 1000)
    for ngram_range in [(1, 1), (1, 2), (1, 3)]
    for min_df in [0.1, 0.2]
    for max_df in [0.8, 0.9, 1.0]
    for n_components in range(5, 30)
]

kf = KFold(n_splits=5)

# Holds the combination currently being evaluated (mutated every iteration).
params = {
    'C': None,
    'ngram_range': None,
    'min_df': None,
    'max_df': None,
    'n_components': None
}

best_cv_score_31 = 0
best_cv_score_std_31 = None
best_model_31 = None
best_parameters_31 = None
param = 0
for C, ngram_range, min_df, max_df, n_components in gridsearch_params:
    cv_scores = []
    param += 1
    print('param', param)
    fold = 0

    params['C'] = C
    params['ngram_range'] = ngram_range
    params['min_df'] = min_df
    params['max_df'] = max_df
    params['n_components'] = n_components

    for train_index, test_index in kf.split(df):
        fold += 1
        print('fold', fold)

        # TF-IDF vocabulary is learned on the training fold only.
        tfidf = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, max_df=max_df)
        temp = tfidf.fit_transform(df['mod_text'].iloc[train_index])
        # get_feature_names() was removed in scikit-learn 1.2; only fall back
        # to it on releases that predate get_feature_names_out().
        if hasattr(tfidf, 'get_feature_names_out'):
            feature_names = list(tfidf.get_feature_names_out())
        else:
            feature_names = tfidf.get_feature_names()
        X_1_tr = pd.DataFrame(temp.toarray(), columns=feature_names)
        temp = tfidf.transform(df['mod_text'].iloc[test_index])
        X_1_te = pd.DataFrame(temp.toarray(), columns=feature_names)

        # Seeded so that scores are comparable across parameter combinations
        # and across notebook runs.
        lda = LatentDirichletAllocation(n_components=n_components, n_jobs=-1, random_state=181)
        temp = lda.fit_transform(X_1_tr)
        feature_names = ['topic {}'.format(i) for i in range(1, n_components + 1)]
        X_2_tr = pd.DataFrame(temp, columns=feature_names)
        temp = lda.transform(X_1_te)
        X_2_te = pd.DataFrame(temp, columns=feature_names)

        # Index is reset so the rows line up with the 0..n-1 index of the
        # TF-IDF and topic frames when concatenated column-wise.
        X_3_tr = df[['neg', 'neu', 'pos', 'compound']].iloc[train_index]
        X_3_tr = X_3_tr.reset_index(drop=True)
        X_3_te = df[['neg', 'neu', 'pos', 'compound']].iloc[test_index]
        X_3_te = X_3_te.reset_index(drop=True)

        X_cv_train = pd.concat([X_1_tr, X_2_tr, X_3_tr], axis=1)
        X_cv_test = pd.concat([X_1_te, X_2_te, X_3_te], axis=1)
        # Folds index into df here, so df['Popular'] is the matching label source.
        y_cv_train = df['Popular'].iloc[train_index]
        y_cv_test = df['Popular'].iloc[test_index]

        # max_iter matches the logistic regressions in sections 1.1-1.3.
        log = LogisticRegression(C=C, solver='lbfgs', max_iter=1000, n_jobs=-1)
        log.fit(X_cv_train, y_cv_train)

        cv_scores.append(log.score(X_cv_test, y_cv_test))

    mean_score = np.mean(cv_scores)
    if mean_score > best_cv_score_31:
        best_cv_score_31 = mean_score
        best_cv_score_std_31 = np.std(cv_scores)
        # Snapshot: `params` is overwritten on the next iteration.
        best_parameters_31 = dict(params)
        # These are the estimators fitted on the last fold of this combination.
        best_model_31 = [tfidf, lda, log]

    print('score', mean_score)
    print('params', params)
