## Pre-processing dataset using numpy and pandas
##### Drop the rows with null values
##### Dropping columns in a df
##### Changing the index of a df
##### Renaming the cols and skipping the rows
##### Performing Natural language processing techniques like removing stopwords,links,etc

In [1]:
#importing the libraries necessary for the pre-processing of the data.
import numpy as np
import pandas as pd


In [47]:
#load the dataset and display
dataset = pd.read_csv(r'D:\NLP Project\BL-Flickr-Images-Book.csv')
dataset.head()

Unnamed: 0,Identifier,Edition Statement,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Corporate Author,Corporate Contributors,Former owner,Engraver,Issuance type,Flickr URL,Shelfmarks
0,206,,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12641.b.30.
1,216,,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12626.cc.2.
2,218,,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12625.dd.1.
3,472,,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.","Appleyard, Ernest Silvanus.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 10369.bbb.15.
4,480,"A new edition, revised, etc.",London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 9007.d.28.


In [48]:
#dropping unecessary cols that are not required (included in the list below)
to_drop = ['Edition Statement',
                 'Corporate Author',
                 'Corporate Contributors',
                 'Former owner',
                 'Engraver',
                 'Contributors',
                 'Issuance type',
                 'Shelfmarks']

dataset.drop(columns = to_drop, inplace = True)
dataset.head()

Unnamed: 0,Identifier,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
0,206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
1,216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
2,218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
3,472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
4,480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [49]:
#changing the indexes for identifier col
dataset['Identifier'].is_unique

True

In [50]:
dataset = dataset.set_index('Identifier')
dataset.head()

Unnamed: 0_level_0,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [51]:
dataset.iloc[206]

Place of Publication                                               London
Date of Publication                                                  1848
Publisher                                                 Richard Bentley
Title                   Rambles in the romantic regions of the Hartz M...
Author                                   Andersen, H. C. (Hans Christian)
Flickr URL              http://www.flickr.com/photos/britishlibrary/ta...
Name: 77554, dtype: object

In [64]:
#remove the null values in the edition statement
dataset = dataset.dropna(subset=['Corporate Author'], inplace=True)
dataset.head()

KeyError: ['Corporate Author']

In [52]:
#renaming the col and skipping rows
#new dataset
olympics_df = pd.read_csv(r"C:\Users\yuvra\Downloads\olympics.csv")
olympics_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !,02 !,03 !,Total,№ Games,01 !,02 !,03 !,Combined total
1,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
2,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
3,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
4,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12


In [33]:
new_names = {'0': 'Country',
        '1': 'Summer Olympics',
        '2': 'Gold',
        '3': 'Silver',
        '4': 'Bronze',
        '5': 'Winter Olympics',
        '6': 'Gold',
        '7': 'Silver',
        '8': 'Bronze',
        '9': '# Games',
        '10': 'Gold',
        '11': 'Silver',
        '12': 'Bronze'}

In [53]:

olympics_df.rename(columns= new_names, inplace=True)
olympics_df.head()

Unnamed: 0,Country,Summer Olympics,Gold,Silver,Bronze,Winter Olympics,Gold.1,Silver.1,Bronze.1,# Games,Gold.2,Silver.2,Bronze.2,13,14,15
0,,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !,02 !,03 !,Total,№ Games,01 !,02 !,03 !,Combined total
1,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
2,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
3,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
4,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12


In [2]:
#Performing Natural language processing techniques like removing stopwords,links,etc
import nltk
from nltk.corpus import stopwords

In [3]:
# Load the dataset
dataset = pd.read_csv(r'C:/Users/yuvra/Downloads/archive (1)/india-news-headlines.csv')
dataset.head()
    

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3876557 entries, 0 to 3876556
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   publish_date       int64 
 1   headline_category  object
 2   headline_text      object
dtypes: int64(1), object(2)
memory usage: 88.7+ MB


In [5]:
dataset.shape

(3876557, 3)

In [6]:
# Remove null values
dataset = dataset.dropna(subset=['publish_date', 'headline_category', 'headline_text'])
dataset.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [7]:
# Get stopwords from NLTK
stop_words = set(stopwords.words('english'))


In [8]:
# Preprocess and remove stopwords from 'headline_text'
from nltk.tokenize import word_tokenize
dataset['headline_text'] = dataset['headline_text'].apply(lambda x: ' '.join([word.lower() for word in word_tokenize(str(x)) if word.lower() not in stop_words]))


In [10]:
print("\n Preprocessing 'headline_text' column :")
dataset['headline_text']


 Preprocessing 'headline_text' column :


0               status quo disturbed ayodhya ; says vajpayee
1                                fissures hurriyat pak visit
2                        america 's unwanted heading india ?
3                                  bigwigs ; destination goa
4                          extra buses clear tourist traffic
                                 ...                        
3876552                    10 pis move hc thwarted seniority
3876553    govt notifies award memory parrikar young scie...
3876554    youth 's death ; pwd installs crash barriers k...
3876555                    authorities acting crz violations
3876556           technicians hold trial run mini-evs panaji
Name: headline_text, Length: 3876557, dtype: object