In [20]:
import pandas as pd
import numpy as np
import datetime
import string
import pandas_profiling

import datetime
from textblob import TextBlob #for polarity and sentiment analysis
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
import swifter
from pprint import pprint

import gensim
from gensim.summarization import summarize, keywords
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Fetch the NLTK corpora used further down: 'wordnet' backs WordNetLemmatizer
# (lemmatize_stemming) and 'stopwords' backs stopwords.words("english").
# Per the captured log output, the call only reports "already up-to-date"
# when the package is already installed locally.
nltk.download('wordnet')
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alice\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alice\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Related to Time Features
def trunc_at(s, d,n):
    """Cuts s off just before the n'th occurrence of the delimiter d.

    s is split on d at most n times and the first n pieces are glued back
    together with d. When s contains fewer than n delimiters it is returned
    unchanged.
    """
    leading_fields = s.split(d, n)[:n]
    return d.join(leading_fields)
def convert_date(date):
    '''Maps a publication timestamp string to its day of the week as an integer.

       Everything from the second comma onwards (e.g. a trailing time of day) is
       cut off with trunc_at, the remainder is parsed with the format '%b %d, %Y'
       and the '%w' weekday number is returned as an int: 0 is Sunday, 6 is Saturday.
    '''
    date_only = trunc_at(date, ",", 2)
    parsed = datetime.datetime.strptime(date_only, '%b %d, %Y')
    weekday_digit = parsed.strftime('%w')
    return int(weekday_digit)

#Related to Weekend Dummy Variable Analysis
def weekend_or_not(day):
    '''Classifies a day-of-week number as "weekend" or "weekday".

       day is expected to be an integer 0-6 with 0 = Sunday and 6 = Saturday
       (the encoding produced by convert_date). Any value other than 0 or 6
       is reported as "weekday".
    '''
    # Sunday (0) and Saturday (6) are the only weekend codes.
    return "weekend" if day in (0, 6) else "weekday"
    
#Related to Month Dummy Variable Analysis
def new_convert_date(date):
    '''Maps a publication timestamp string to its abbreviated month name.

       Everything from the second comma onwards (e.g. a trailing time of day) is
       cut off with trunc_at, the remainder is parsed with the format '%b %d, %Y'
       and the month is returned in '%b' form, e.g. 'Dec'.
    '''
    date_only = trunc_at(date, ",", 2)
    parsed = datetime.datetime.strptime(date_only, '%b %d, %Y')
    return parsed.strftime('%b')

#Related to Polarity Analysis
def find_polarity(words):
    '''Returns one TextBlob sentiment polarity per element of words, in the same order.

       words is iterated element by element; each element is scored on its own
       via TextBlob(element).sentiment.polarity.
    '''
    scores = []
    for token in words:
        scores.append(TextBlob(token).sentiment.polarity)
    return scores
def find_pos_words(polarity_words):
    '''Keeps only the positive polarity scores.

       polarity_words holds the polarity of every word in an article; a score
       counts as positive when it is strictly greater than 0. Order is preserved.
    '''
    positives = []
    for score in polarity_words:
        if score > 0:
            positives.append(score)
    return positives
def find_neg_words(polarity_words):
    '''Keeps only the negative polarity scores.

       polarity_words holds the polarity of every word in an article; a score
       counts as negative when it is strictly less than 0. Order is preserved.
    '''
    negatives = []
    for score in polarity_words:
        if score < 0:
            negatives.append(score)
    return negatives
def num_neu_words(polarity_words):
    '''Counts the neutral words of an article.

       polarity_words holds the polarity of every word in an article; a word
       is neutral when its polarity is exactly 0.
    '''
    return sum(1 for score in polarity_words if score == 0)



#Related to Unique Words Analysis
stop_words = stopwords.words("english")
def tokenize(text):
    '''Tokenizes an article with nltk's word_tokenize and drops every token found in stop_words.

       The remaining tokens are returned in their original order and casing.
    '''
    # NOTE(review): the membership test is an exact, case-sensitive match; if the
    # stop word list is all lowercase, capitalised stop words pass through - confirm
    # that this is intended.
    kept = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept.append(token)
    return kept



# Related to Unique Words Analysis
def preprocess_words(text):
    '''Returns the words of text lowercased and without punctuation.

       text is split on whitespace, every character in string.punctuation is
       removed from each piece and the result is lowercased. Pieces that were
       made up of punctuation only (a free-standing "-" or "..." for example)
       end up empty after stripping; they are dropped so that they are not
       reported, and later counted, as words.
    '''
    table = str.maketrans('', '', string.punctuation)
    cleaned = (piece.translate(table).lower() for piece in text.split())
    # An empty string here means the piece contained nothing but punctuation.
    return [word for word in cleaned if word]
def non_stop_words(text):
    '''Returns the words of text that are not in stop_words.

       The text is first run through preprocess_words, so the comparison is
       made on lowercased words with punctuation removed.
    '''
    return [word for word in preprocess_words(text) if word not in stop_words]

# Related to LDA Analysis 
stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    '''Lemmatizes a single word and then stems the lemma.

       The word is lemmatized as a verb (pos='v') with a WordNetLemmatizer and
       the result is passed to the module-level English Snowball stemmer.
    '''
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    '''Turns raw text into a list of lemmatized, stemmed tokens.

       The text is tokenized with gensim.utils.simple_preprocess. Tokens that
       are gensim stop words, or that are three characters long or shorter,
       are discarded; every remaining token goes through lemmatize_stemming.
    '''
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]

def find_probs(prob_list,num):
    '''Returns the topic probabilities of one document as a flat list ordered by topic.

       prob_list: sequence of (topic index, probability) entries for a single
                  document. It may be sparse or unordered: topic lists can omit
                  topics (gensim drops those below its minimum probability), so
                  an entry's position is not relied on.
       num: number of topics of the lda model.

       Returns a list of length num whose i'th element is the probability that
       prob_list pairs with topic index i, or 0.0 when prob_list has no entry
       for that topic. For a dense, ordered prob_list this is the same list
       that positional indexing would give.
    '''
    # Key on the topic index carried by each entry instead of its position, so a
    # missing topic neither raises IndexError nor shifts later probabilities.
    by_topic = {int(entry[0]): entry[1] for entry in prob_list}
    return [by_topic.get(topic, 0.0) for topic in range(num)]


#Related to TimeDelta Analysis
def timedelta(time,run_date):
    '''Returns the number of days between an article's publication date and run_date.

       time is a timestamp string; everything from its second comma onwards is
       cut off with trunc_at and the remainder is parsed with '%b %d, %Y'.
       run_date is the datetime the parsed date is subtracted from. The result
       is the .days component of that difference.
    '''
    date_only = trunc_at(time, ",", 2)
    published_on = datetime.datetime.strptime(date_only, '%b %d, %Y')
    elapsed = run_date - published_on
    return elapsed.days


In [5]:
# Load the article data set; the path is relative to the notebook's working directory.
df = pd.read_csv("final_data.csv")

In [9]:
# Remove the 'Unnamed: 0' column (presumably an index column written by an
# earlier to_csv call - confirm). errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# Publication timestamps as strings; each one is parsed by timedelta() when
# the "timedelta" feature is built.
time = df["time"] #time of publication for every article

In [12]:
#Timedelta Feature
# Days elapsed between each article's publication date and a fixed reference date.
time_run = 'Dec 1, 2020'
run_date = datetime.datetime.strptime(time_run, '%b %d, %Y')
df["timedelta"] = time.apply(lambda published: timedelta(published, run_date))

In [24]:
text = df["text"]
# keywords() returns ONE newline-joined string unless split=True, so len() of
# the default result is a character count. Ask for a list so that len() is
# the number of extracted keywords.
df['num_keywords'] = [len(keywords(article, split=True)) for article in text] #number of keywords

In [25]:
# Missing-value count per column, printed next to the column name so that a
# non-zero count can be traced back to its column.
for column in df.columns:
    print(column, df[column].isnull().sum())

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [15]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(how='any')

In [26]:
# Number of rows currently in df.
len(df)

7446

In [27]:
# Persist the frame. The row index is written as well (pandas' default, spelled
# out here), so the file starts with an unnamed index column.
df.to_csv("final_7k.csv", index=True)

In [28]:
# List the column names of df.
df.columns

Index(['link', 'title', 'text', 'view', 'topic', 'time', 'kw_min_min',
       'kw_min_max', 'kw_min_avg', 'kw_max_min', 'kw_max_max', 'kw_max_avg',
       'kw_avg_min', 'kw_avg_max', 'kw_avg_avg', 'Business', 'Innovation',
       'Leadership', 'Lifestyle', 'Money', 'month', 'Apr', 'Aug', 'Dec', 'Feb',
       'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep',
       'n_tokens_title:', 'n_tokens_content', 'n_unique_tokens',
       'average_token_length', 'n_non_stop_words', 'n_non_stop_unique_tokens',
       'day_of_week', 'friday', 'monday', 'saturday', 'sunday', 'thursday',
       'tuesday', 'wednesday', 'weekend_or_weekday', 'weekday', 'weekend',
       'global_sentiment_polarity', 'global_subjectivity',
       'title_sentiment_polarity', 'abs_title_sentiment_polarity',
       'title_subjectivity', 'abs_title_subjectivity',
       'global_rate_positive_words', 'global_rate_negative_words',
       'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity',
       'min_p

In [None]:
# Treat empty strings as missing values so the report counts them as such.
df = df.replace('', np.nan)

# select features for summary stats
profile_columns = [
    'link', 'title', 'text', 'view',
    'Business', 'Innovation', 'Leadership', 'Lifestyle', 'Money',
    'Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun',
    'Mar', 'May', 'Nov', 'Oct', 'Sep',
    'n_tokens_title:', 'n_tokens_content', 'n_unique_tokens',
    'average_token_length', 'n_non_stop_words', 'n_non_stop_unique_tokens',
    'friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday',
    'wednesday', 'weekday', 'weekend',
    'global_sentiment_polarity', 'global_subjectivity',
    'title_sentiment_polarity', 'abs_title_sentiment_polarity',
    'title_subjectivity', 'abs_title_subjectivity',
    'global_rate_positive_words', 'global_rate_negative_words',
    'rate_positive_words', 'rate_negative_words',
    'avg_positive_polarity', 'min_positive_polarity', 'max_positive_polarity',
    'avg_negative_polarity', 'min_negative_polarity', 'max_negative_polarity',
    'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04',
    'timedelta',
    'kw_min_min', 'kw_min_max', 'kw_min_avg',
    'kw_max_min', 'kw_max_max', 'kw_max_avg',
    'kw_avg_min', 'kw_avg_max', 'kw_avg_avg',
    'num_keywords',
]
# Build the profiling report for the selected columns and render it inline.
profile = pandas_profiling.ProfileReport(df[profile_columns])
profile.to_notebook_iframe()

Summarize dataset:  92%|███████████████████████████████████████▎   | 76/83 [00:24<00:07,  1.03s/it, Get scatter matrix]