# Importing libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import random
import re
import string

import scipy.spatial
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams


from tqdm import tqdm
from pprint import pprint

import sys
import os
import glob
# Graphics in retina format are more sharp and legible
%config InlineBackend.figure_format = 'retina'


In [2]:
PATH = '../input/good-reads-quotes'

In [3]:
# Print the path of every entry directly inside the dataset directory (PATH).
for name in glob.glob(PATH + '/*'):
    print(name)

## Working with each quote label, one quote for one label

As we saw, there are many CSV files, each of which contains the quotes for one specific label.



We want to build just `N` classes (similar to sentiment classes) and fill each class with the quotes of the labels that relate to it.

* Class `love` -> love + poetry + romance + relationships
* Class `motivation` -> life + inspirational + hope + success

* Class `wisdom` -> truth + faith + wisdom + spirituality

Then we will build a multi-class classification model for this project.

In [4]:
# Tag names grouped by target class. Each name is later inserted into a
# "quotes_of_<name>.csv" file name under PATH when the class lists are built.
love_quotes = ['love' , 'poetry' , 'romance' , 'relationships']
motivation_quotes = ['hope', 'life' , 'inspirational' , 'success']
wisdom_quotes = ['wisdom', 'truth' , 'faith' , 'spirituality']


# Customize data

### Make a love class

In [5]:
PATH

In [6]:
# Peek at one source file; .head() avoids rendering the whole frame in the output.
pd.read_csv(PATH + '/quotes_of_love.csv').head()

In [7]:
# Collect the 'quotes' column of every CSV that feeds the love class
# (one file per tag name: quotes_of_<tag>.csv).
love_list = []
for q in love_quotes:
    path = PATH + "/quotes_of_" + q + ".csv"
    df = pd.read_csv(path)
    # extend() appends the rows one by one, in file order.
    love_list.extend(df['quotes'].values)

# Show the first collected quote as a sanity check.
love_list[0]

In [8]:
len(love_list)

In [9]:
# Check for missing quotes. pandas loads empty CSV fields as NaN rather than None,
# so `None in love_list` could never detect them; pd.isna recognises both.
bool(pd.isna(pd.Series(love_list, dtype=object)).any())

### Make a motivation class

In [10]:
# Collect the 'quotes' column of every CSV that feeds the motivation class
# (one file per tag name: quotes_of_<tag>.csv).
motive_list = []
for q in motivation_quotes:
    path = PATH + "/quotes_of_" + q + ".csv"
    df = pd.read_csv(path)
    # extend() appends the rows one by one, in file order.
    motive_list.extend(df['quotes'].values)

# Show the first collected quote as a sanity check.
motive_list[0]

In [11]:
len(motive_list)

In [12]:
# Check for missing quotes. pandas loads empty CSV fields as NaN rather than None,
# so `None in motive_list` could never detect them; pd.isna recognises both.
bool(pd.isna(pd.Series(motive_list, dtype=object)).any())

### Make a wisdom class

In [13]:
# Collect the 'quotes' column of every CSV that feeds the wisdom class
# (one file per tag name: quotes_of_<tag>.csv).
wisdom_list = []
for q in wisdom_quotes:
    path = PATH + "/quotes_of_" + q + ".csv"
    df = pd.read_csv(path)
    # extend() appends the rows one by one, in file order.
    wisdom_list.extend(df['quotes'].values)

# Show the first collected quote as a sanity check.
wisdom_list[0]

In [14]:
len(wisdom_list)

In [15]:
# Check for missing quotes. pandas loads empty CSV fields as NaN rather than None,
# so `None in wisdom_list` could never detect them; pd.isna recognises both.
bool(pd.isna(pd.Series(wisdom_list, dtype=object)).any())

### Collect quotes and labels

**Combine all three classes into one list**

In [16]:
quotes = love_list + motive_list + wisdom_list

In [17]:
len(quotes)

**Make the labels for each class and collect them in one list**

In [18]:
print(len(love_list) , len(motive_list), len(wisdom_list))

**What about label?**


I can make the labels for each class by using the size of each class list, and then build one list that contains the labels of all classes.



In [19]:
# One label per quote: repeat each class name as many times as its list is long.
love = ['love'] * len(love_list)                 # love labels
motivation = ['motivation'] * len(motive_list)   # motivation labels
wisdom = ['wisdom'] * len(wisdom_list)           # wisdom labels

# Same order as the quotes: love, then motivation, then wisdom.
labels = love + motivation + wisdom
len(labels)

In [20]:
# One sample per class: the first love label, the first motivation label
# (the motivation labels start right after the love ones) and the last wisdom label.
labels[0], labels[len(love_list)], labels[-1]

In [21]:
# They have the same length
len(quotes), len(labels)

**Shuffle quotes and labels for modeling**

In [22]:
import random

# Toy example: shuffle two parallel sequences together so the pairs stay aligned.
a = list('abc')
b = list(range(1, 4))

# Pair the items up, shuffle the pairs, then split them back into two tuples.
c = list(zip(a, b))
random.shuffle(c)
a, b = zip(*c)

print(a, b, sep='\n')


In [23]:
# Shuffle quotes and labels as pairs so every quote keeps its own label.
# A dedicated generator with a fixed seed (42, arbitrary) gives the same order on
# every fresh run and leaves the global `random` state untouched.
shuffled_data = list(zip(quotes, labels))
random.Random(42).shuffle(shuffled_data)
quotes, labels = zip(*shuffled_data)


In [24]:
len(quotes), len(labels)

In [25]:
print(quotes[0])
print(labels[0])

### Put the quotes and labels into a DataFrame

In [26]:
# Column name -> values; both sequences are in the same (shuffled) order.
columns = {
    'quotes': quotes,
    'class': labels,
}
data = pd.DataFrame(columns)
data.head()

In [27]:
data.shape

# EDA

In [28]:
# Missing Values  -> there is no null values
data.isnull().sum()

In [29]:
data['class'].value_counts()

In [30]:
data['class'].value_counts(normalize = True)

In [31]:
sns.countplot(data= data, x= 'class',
             order = data['class'].value_counts().index);

**Examples of each class**

In [32]:
# Print the first quote found for each class: love, motivation, wisdom.
for cls, heading in (
    ('love', '--Love class example:--\n'),
    ('motivation', '--Motivation class example:--\n'),
    ('wisdom', '--Wisdom class example:--\n'),
):
    print(heading, data[data['class'] == cls]['quotes'].values[0])

# Text Data Preprocessing
We need to pre-process the data to get it all into a consistent format. We need to clean, tokenize, and convert our data into a matrix. Let's create a function that performs the following tasks on the text columns:

* Tokenizes
* Make text lowercase
* Removes hyperlinks
* Remove punctuation
* Removes numbers
* Removes useless words "stopwords"
* Stemming/Lemmatization


In [33]:
nltk.download('stopwords')

In [34]:
stop_words = stopwords.words('english')
stemmer    = nltk.SnowballStemmer("english")

In [35]:
def clean_text(text):
    '''
        Normalise one piece of text before stopword removal and stemming.

        Steps, in order: lowercase, remove text in square brackets, remove links,
        remove <...> tags, remove ASCII punctuation, replace line breaks with a
        space and remove every token that contains a digit.

        text: value to clean; it is passed through str() first, so a non-string
              value is cleaned as its string form.
        Returns the cleaned string. Runs of spaces can remain where something
        was removed.
    '''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)                 # text in square brackets
    # Links are removed before punctuation, while "://" and "." are still intact.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)                  # <...> tags
    # string.punctuation is ASCII only, so typographic marks such as “ ” are kept.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A space (not an empty string) keeps the words on adjacent lines separate.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)                # tokens containing a digit
    return text

In [36]:
def preprocess_data(text):
    '''
        Clean one quote, drop its stopwords and stem the remaining words.
        Returns the stemmed words joined by single spaces.
    '''
    tokens = clean_text(text).split()                           # clean punctuation, tags, and so on
    kept = [tok for tok in tokens if tok not in stop_words]     # remove stopwords
    return ' '.join(stemmer.stem(tok) for tok in kept)          # stem every remaining word

In [37]:
def remove_quotations(data):
    '''
        Extract the text between curly quotes from each row of `data`.

        For every row, the first match of “...” is taken; the captured text may not
        contain an ASCII double quote, and the match is greedy, so it runs up to the
        last closing curly quote it can reach. Rows without a match are skipped, so
        the result can be shorter than `data`.
    '''
    pattern = re.compile('“([^"]*)”')
    found = (pattern.search(row) for row in data)
    return [match.group(1) for match in found if match]

In [38]:
data['clean_text'] = data['quotes'].apply(preprocess_data)
data.head()

**Convert sentiment to numerical variable**


In [39]:
data['label'] = data['class'].map({'love': 0,
                                'motivation': 1,
                                'wisdom': 2})
data.head()

# Analyzing Text Statistics

We can now do some statistical analysis to explore the data like:

* Text length analysis.
    * length of the whole sentence (number of characters).
    * number of words in each sentence.
* word frequency analysis

In [40]:
data['text_n_chars'] = data.clean_text.apply(len) # count all chars in each sentence
data['text_n_words'] = data.clean_text.apply(lambda sent: len(sent.split())) # count number of words in each sentence
data.head()

In [41]:
max(data['text_n_words']), min(data['text_n_words'])

In [42]:
data['text_n_words'].value_counts()

**The distribution of number of words for each class.**

In [43]:
sns.histplot(data= data, x= 'text_n_words', hue= 'class', multiple= 'stack');

We will cap the word count at 100: every sentence with more than 100 words is counted as 100.

In [44]:
data['text_n_words'] = data['text_n_words'].apply(lambda x : 100 if x > 100 else x)

In [45]:
max(data['text_n_words']), min(data['text_n_words'])

In [46]:
sns.histplot(data= data, x= 'text_n_words', hue= 'class', multiple= 'stack');

## Most frequent words

In [47]:
from collections import Counter

In [48]:
def freq_words(text, c, num):
    '''
        Word-frequency table for one class.

        text: DataFrame with 'class' and 'clean_text' columns
        c:    class value whose rows are counted
        num:  the ranked list is sliced with [:num]

        Returns a DataFrame with columns 'word' and 'counts', most frequent first.
    '''
    class_sentences = text.loc[text['class'] == c, 'clean_text']
    tally = Counter()
    for sentence in class_sentences:
        tally.update(sentence.split())
    # most_common() without an argument ranks every word, highest count first.
    ranked = tally.most_common()
    return pd.DataFrame(ranked[:num], columns=['word', 'counts'])

In [49]:
def plot_freq(data, st):
    '''
        Draw a horizontal bar chart of a frequency table.

        data: DataFrame with 'word' and 'counts' columns
        st:   name of the class (or group) shown in the title
    '''
    plt.figure(figsize=(12, 6))
    sns.barplot(data= data , x= 'counts', y= 'word')
    # The title reports the number of rows actually plotted instead of a fixed 20.
    plt.title(f'Top {len(data)} words in {st} quotes')
    plt.show()

### Frequent words for the love class

In [50]:
love_words = freq_words(data, 'love', 20)
love_words.T

In [51]:
plot_freq(love_words, 'love')

### Frequent words for the motivation class

In [52]:
motivation_words = freq_words(data, 'motivation', 20)
motivation_words.T

In [53]:
plot_freq(motivation_words, 'motivation')

### Frequent words for the wisdom class

In [54]:
wisdom_words = freq_words(data, 'wisdom', 20)
wisdom_words.T

In [55]:
plot_freq(wisdom_words, 'wisdom')

## Distribution of top n-grams

In [56]:
def get_top_n_gram(corpus, c,  n_gram, top_n=None):
    '''
        Count the n-grams of one class and return the most frequent ones.

        corpus: DataFrame with 'class' and 'clean_text' columns
        c:      class value whose rows are used
        n_gram: number of consecutive words per n-gram
        top_n:  number of rows to return; None returns every n-gram

        Returns a DataFrame with columns 'word' (the n-gram joined by spaces)
        and 'counts', most frequent first.
    '''
    tally = Counter()
    for sentence in corpus[corpus['class'] == c]['clean_text']:
        words = sentence.split()
        # Slide the window inside one quote at a time: windowing the flattened word
        # list would pair the last word of a quote with the first word of the next.
        windows = zip(*(words[start:] for start in range(n_gram)))
        tally.update(' '.join(gram) for gram in windows)
    ranked = tally.most_common()
    return pd.DataFrame(ranked[:top_n], columns=['word', 'counts'])

### Two-grams for love quotes

In [57]:
love_2_gram = get_top_n_gram(data, 'love', 2, 20)
love_2_gram.T

In [58]:
plot_freq(love_2_gram, 'love')

### Two-grams for motivation quotes

In [59]:
motivation_2_gram = get_top_n_gram(data, 'motivation', 2, 20)
motivation_2_gram.T

In [60]:
# Plot the motivation bigrams (the love table was passed here by mistake).
plot_freq(motivation_2_gram, 'motivation')

### Two-grams for wisdom quotes

In [61]:
wisdom_2_gram = get_top_n_gram(data, 'wisdom', 2, 20)
wisdom_2_gram.T

In [62]:
plot_freq(wisdom_2_gram, 'wisdom')

**We can easily make tri-grams for sentiment using this function `get_top_n_gram` by passing `n_gram = 3`**

## Word Cloud

In [63]:
# getting list of love quotes
love_text_clean = data[data['class' ] == 'love']['clean_text']
love_clean_words = [word for words in love_text_clean for word in words.split()]

In [64]:
# getting list of motivation quotes
motivation_text_clean = data[data['class' ] == 'motivation']['clean_text']
motivation_clean_words = [word for words in motivation_text_clean for word in words.split()]


In [65]:
# getting list of wisdom quotes
wisdom_text_clean = data[data['class' ] == 'wisdom']['clean_text']
wisdom_clean_words = [word for words in wisdom_text_clean for word in words.split()]


In [None]:
from wordcloud import WordCloud

# One panel per class, side by side.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=[30, 15])

# Build the three clouds with identical settings, one per class word list.
wordcloud1, wordcloud2, wordcloud3 = (
    WordCloud(background_color='white', width=600, height=400).generate(" ".join(words))
    for words in (love_clean_words, motivation_clean_words, wisdom_clean_words)
)

# Draw each cloud on its own axis, without ticks, under its class title.
panels = (
    (ax1, wordcloud1, 'love quotes'),
    (ax2, wordcloud2, 'motivation quotes'),
    (ax3, wordcloud3, 'wisdom quotes'),
)
for ax, cloud, title in panels:
    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(title, fontsize=40)

# Working with one quote for multi-label

In [None]:
quotes_multi_label_data = pd.read_csv(PATH + '/popular_quotes.csv')
quotes_multi_label_data.head()

In [None]:
print(quotes_multi_label_data['tags'][0])
print(type(quotes_multi_label_data['tags'][0]))
print(quotes_multi_label_data['tags'][0][0]) # prints a single character instead of a tag while the value is still a str

In [None]:
# Run the same preprocessing pipeline on the multi-label quotes.
quotes_multi_label_data['clean_text'] = quotes_multi_label_data['quotes'].apply(preprocess_data)


def _parse_tags(tags):
    '''
        Split a tag string into a list of tags: quotes and commas are removed,
        the first and last characters (presumably "[" and "]" — confirm against
        the CSV) are cut off and the rest is split on whitespace.
        A value that is not a str (e.g. a list left by an earlier run of this
        cell) is returned unchanged, so the cell can be re-run safely.
    '''
    if not isinstance(tags, str):
        return tags
    return tags.replace("'","").replace(",","")[1:-1].split()


quotes_multi_label_data['tags'] = quotes_multi_label_data['tags'].apply(_parse_tags)

# Number of tags attached to each quote.
quotes_multi_label_data['n_tags'] = quotes_multi_label_data['tags'].apply(len)
quotes_multi_label_data.head()

In [None]:
print(quotes_multi_label_data['tags'][0])
print(type(quotes_multi_label_data['tags'][0]))
print(quotes_multi_label_data['tags'][0][0]) # now prints the first tag: the value is a list of str after the split above

Here we converted the string representation of the list into an actual list.

### Distribution on number of tags

In [None]:
sns.histplot(data= quotes_multi_label_data, x= 'n_tags');

### Make a list of frequent tags

In [None]:
tag_list = [word for sent in quotes_multi_label_data['tags'] for word in sent]

In [None]:
freq_tags = Counter(tag_list)
freq_tags_sorted = sorted(freq_tags.items(), key=lambda pair: pair[1], reverse=True)
freq_tags_df = pd.DataFrame(freq_tags_sorted[:20], columns=['word', 'counts'])
freq_tags_df.T

### Most frequent tags

In [None]:
plot_freq(freq_tags_df, 'Most_tags')

In [None]:
wordcloud = WordCloud( background_color='white', max_words= 50).generate(" ".join(tag_list))
plt.title('Most tags of quotes',fontsize=30)
plt.imshow(wordcloud)
plt.axis("off")
plt.show();