## Basic Cleaning Operation 

In [5]:
import pandas as pd

In [7]:
data= pd.read_csv('bw_project.csv')
data

Unnamed: 0,Author,Quote,Tag_Names
0,Albert Einstein,“The world as we have created it is a process ...,"change,deep-thoughts,thinking,world"
1,J.K. Rowling,"“It is our choices, Harry, that show what we t...","abilities,choices"
2,Albert Einstein,“There are only two ways to live your life. On...,"inspirational,life,live,miracle,miracles"
3,Jane Austen,"“The person, be it gentleman or lady, who has ...","aliteracy,books,classic,humor"
4,Marilyn Monroe,"“Imperfection is beauty, madness is genius and...","be-yourself,inspirational"
...,...,...,...
95,Harper Lee,“You never really understand a person until yo...,better-life-empathy
96,Madeleine L'Engle,“You have to write the book that wants to be w...,"books,children,difficult,grown-ups,write,write..."
97,Mark Twain,“Never tell the truth to people who are not wo...,truth
98,Dr. Seuss,"“A person's a person, no matter how small.”",inspirational


###  Data Cleaning and Preprocessing

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Author     100 non-null    object
 1   Quote      100 non-null    object
 2   Tag_Names  97 non-null     object
dtypes: object(3)
memory usage: 2.5+ KB


### Finding and TreatingNull values

In [13]:
data.isnull().sum()

Author       0
Quote        0
Tag_Names    3
dtype: int64

In [15]:
# As Null value is 3% in the dataset we cant remove it. 
# fills in missing values (NaN) with Unknown. 

data['Tag_Names'].fillna('Unknown', inplace=True) 
data['Tag_Names']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Tag_Names'].fillna('Unknown', inplace=True)


0                   change,deep-thoughts,thinking,world
1                                     abilities,choices
2              inspirational,life,live,miracle,miracles
3                         aliteracy,books,classic,humor
4                             be-yourself,inspirational
                            ...                        
95                                  better-life-empathy
96    books,children,difficult,grown-ups,write,write...
97                                                truth
98                                        inspirational
99                                           books,mind
Name: Tag_Names, Length: 100, dtype: object

In [17]:
data.info() # Null values treated successfully

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Author     100 non-null    object
 1   Quote      100 non-null    object
 2   Tag_Names  100 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [19]:
data.head()

Unnamed: 0,Author,Quote,Tag_Names
0,Albert Einstein,“The world as we have created it is a process ...,"change,deep-thoughts,thinking,world"
1,J.K. Rowling,"“It is our choices, Harry, that show what we t...","abilities,choices"
2,Albert Einstein,“There are only two ways to live your life. On...,"inspirational,life,live,miracle,miracles"
3,Jane Austen,"“The person, be it gentleman or lady, who has ...","aliteracy,books,classic,humor"
4,Marilyn Monroe,"“Imperfection is beauty, madness is genius and...","be-yourself,inspirational"


### Treat ' "" ' (quotations) from Quote column 

In [22]:
data["Quote"] = data["Quote"].str.replace('“', '')

In [24]:
# Quotations treated successfully
data 

Unnamed: 0,Author,Quote,Tag_Names
0,Albert Einstein,The world as we have created it is a process o...,"change,deep-thoughts,thinking,world"
1,J.K. Rowling,"It is our choices, Harry, that show what we tr...","abilities,choices"
2,Albert Einstein,There are only two ways to live your life. One...,"inspirational,life,live,miracle,miracles"
3,Jane Austen,"The person, be it gentleman or lady, who has n...","aliteracy,books,classic,humor"
4,Marilyn Monroe,"Imperfection is beauty, madness is genius and ...","be-yourself,inspirational"
...,...,...,...
95,Harper Lee,You never really understand a person until you...,better-life-empathy
96,Madeleine L'Engle,You have to write the book that wants to be wr...,"books,children,difficult,grown-ups,write,write..."
97,Mark Twain,Never tell the truth to people who are not wor...,truth
98,Dr. Seuss,"A person's a person, no matter how small.”",inspirational


## Saving the Cleaned Data

In [27]:
data.to_csv('bw_project_cleaned.csv', index = False)
print("Data has been saved to .csv format and file name is: 'bw_project_cleaned.csv'")

Data has been saved to .csv format and file name is: 'bw_project_cleaned.csv'


## Insights from Dataset

### No. of Unique Authors and their names

In [31]:
data['Author'].nunique()

50

In [33]:
data['Author'].unique()

array(['Albert Einstein', 'J.K. Rowling', 'Jane Austen', 'Marilyn Monroe',
       'André Gide', 'Thomas A. Edison', 'Eleanor Roosevelt',
       'Steve Martin', 'Bob Marley', 'Dr. Seuss', 'Douglas Adams',
       'Elie Wiesel', 'Friedrich Nietzsche', 'Mark Twain',
       'Allen Saunders', 'Pablo Neruda', 'Ralph Waldo Emerson',
       'Mother Teresa', 'Garrison Keillor', 'Jim Henson',
       'Charles M. Schulz', 'William Nicholson', 'Jorge Luis Borges',
       'George Eliot', 'George R.R. Martin', 'C.S. Lewis',
       'Martin Luther King Jr.', 'James Baldwin', 'Haruki Murakami',
       'Alexandre Dumas fils', 'Stephenie Meyer', 'Ernest Hemingway',
       'Helen Keller', 'George Bernard Shaw', 'Charles Bukowski',
       'Suzanne Collins', 'J.R.R. Tolkien', 'Alfred Tennyson',
       'Terry Pratchett', 'J.D. Salinger', 'George Carlin', 'John Lennon',
       'W.C. Fields', 'Ayn Rand', 'Jimi Hendrix', 'J.M. Barrie',
       'E.E. Cummings', 'Khaled Hosseini', 'Harper Lee',
       "Madeleine L'E

### No of Unique Tags

In [36]:
all_tags = data["Tag_Names"].str.split(",")
tag_counts = pd.Series([tag.strip() for tags in all_tags for tag in tags]).nunique()
tag_counts

138

### Author who wrote maximum Quotes

In [39]:
no_ofQuotes = data.groupby(['Author'])['Quote'].count()
print(no_ofQuotes)

Author
Albert Einstein           10
Alexandre Dumas fils       1
Alfred Tennyson            1
Allen Saunders             1
André Gide                 1
Ayn Rand                   1
Bob Marley                 3
C.S. Lewis                 5
Charles Bukowski           2
Charles M. Schulz          1
Douglas Adams              1
Dr. Seuss                  6
E.E. Cummings              1
Eleanor Roosevelt          2
Elie Wiesel                1
Ernest Hemingway           2
Friedrich Nietzsche        1
Garrison Keillor           1
George Bernard Shaw        1
George Carlin              1
George Eliot               1
George R.R. Martin         2
Harper Lee                 1
Haruki Murakami            1
Helen Keller               1
J.D. Salinger              1
J.K. Rowling               9
J.M. Barrie                1
J.R.R. Tolkien             1
James Baldwin              1
Jane Austen                5
Jim Henson                 1
Jimi Hendrix               1
John Lennon                1
Jorge L

### Find the number of quotes by each author

In [42]:
no_ofQuotes = data.groupby(['Author'])['Quote'].count()
print(no_ofQuotes)

Author
Albert Einstein           10
Alexandre Dumas fils       1
Alfred Tennyson            1
Allen Saunders             1
André Gide                 1
Ayn Rand                   1
Bob Marley                 3
C.S. Lewis                 5
Charles Bukowski           2
Charles M. Schulz          1
Douglas Adams              1
Dr. Seuss                  6
E.E. Cummings              1
Eleanor Roosevelt          2
Elie Wiesel                1
Ernest Hemingway           2
Friedrich Nietzsche        1
Garrison Keillor           1
George Bernard Shaw        1
George Carlin              1
George Eliot               1
George R.R. Martin         2
Harper Lee                 1
Haruki Murakami            1
Helen Keller               1
J.D. Salinger              1
J.K. Rowling               9
J.M. Barrie                1
J.R.R. Tolkien             1
James Baldwin              1
Jane Austen                5
Jim Henson                 1
Jimi Hendrix               1
John Lennon                1
Jorge L

### List the top 5 most common tags.

In [45]:
# IF breaks the tag_counts part: 
# for tags in all_tags:
#    for tag in tags:
#        tag.strip()
# pd.Serise is a one-dimensional labeled array in Pandas that can store any data type, such as numbers, strings, or even Python objects. Counting values, Storing a single column of data separately, Performing calculations on a single column.

all_tags = data["Tag_Names"].str.split(",")
tag_counts = pd.Series([tag.strip() for tags in all_tags for tag in tags]).value_counts()
top_5_tags = tag_counts.head(5)
print(top_5_tags)

love             14
inspirational    13
life             13
humor            12
books            11
Name: count, dtype: int64


### Find authors who have more than 5 quotes.

In [48]:
more_than_fiveQuotes = data.groupby(['Author'])['Quote'].count()
sort_quotes = more_than_fiveQuotes.sort_values(ascending=False)
sort_quotes = sort_quotes[sort_quotes>5]
sort_quotes

Author
Albert Einstein    10
J.K. Rowling        9
Marilyn Monroe      7
Mark Twain          6
Dr. Seuss           6
Name: Quote, dtype: int64

### Retrieve the longest quote and its author.

In [51]:
data['length_quote'] = data["Quote"].str.split(' ').apply(len)
longest_quote= data.groupby(['Author'])['length_quote'].max()
longest_quote.sort_values(ascending = False)

Author
Marilyn Monroe            201
Bob Marley                138
C.S. Lewis                 92
Pablo Neruda               66
Jane Austen                59
Ralph Waldo Emerson        52
J.D. Salinger              47
J.K. Rowling               46
Charles Bukowski           44
Dr. Seuss                  41
Elie Wiesel                37
Helen Keller               31
Albert Einstein            31
Jim Henson                 27
James Baldwin              27
Madeleine L'Engle          27
Harper Lee                 27
John Lennon                25
Jimi Hendrix               25
Terry Pratchett            24
Garrison Keillor           24
Mark Twain                 22
Douglas Adams              22
Haruki Murakami            20
André Gide                 19
Eleanor Roosevelt          19
George R.R. Martin         19
Alfred Tennyson            18
Suzanne Collins            17
Mother Teresa              17
Friedrich Nietzsche        16
George Carlin              16
Ernest Hemingway           16
Ayn