In [1]:
import pandas as pd
import numpy as np

## CSV FILE
The CSV file format is a popular format supported by many machine learning frameworks. The format is variously referred to "comma-separated values" or "character-separated values."

A CSV file stores tabular data (numbers and text) in plain text form. A CSV file consists of any number of records, separated by line breaks of some kind. Each record consists of fields, separated by a literal comma. In some regions, the separator might be a semi-colon.

Typically, all records have an identical number of fields, and missing values are represented as nulls or empty strings. There are a number of ways to load a CSV file in Python. 

# loading CSV local file

In [2]:
df1 = pd.read_csv('aug_train.csv')

In [3]:
df1

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


![image.png](attachment:50495b0f-6ab6-44f6-9d38-d00bbb195bd3.png)

## Sep Parameter

used to read files other than csv

In [4]:
df2 = pd.read_csv('movie_titles_metadata.tsv',sep='\t')
df2

Unnamed: 0,m0,10 things i hate about you,1999,6.90,62847,['comedy' 'romance']
0,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
1,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
2,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
3,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']
4,m5,the fifth element,1997,7.5,133756.0,['action' 'adventure' 'romance' 'sci-fi' 'thri...
...,...,...,...,...,...,...
611,m612,watchmen,2009,7.8,135229.0,['action' 'crime' 'fantasy' 'mystery' 'sci-fi'...
612,m613,xxx,2002,5.6,53505.0,['action' 'adventure' 'crime']
613,m614,x-men,2000,7.4,122149.0,['action' 'sci-fi']
614,m615,young frankenstein,1974,8.0,57618.0,['comedy' 'sci-fi']


## names parameter
in the above example , the column names are missing instead of that data of row 1 is placed 
to rectify this we use names parameter
names parameter assigns column names when needed

In [5]:
df2 = pd.read_csv('movie_titles_metadata.tsv',sep = '\t' , names = ['s.no.','movie title' , 'release date' , 'rating' , 'collection' , 'genres'])
df2

Unnamed: 0,s.no.,movie title,release date,rating,collection,genres
0,m0,10 things i hate about you,1999,6.9,62847.0,['comedy' 'romance']
1,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
2,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
3,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
4,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']
...,...,...,...,...,...,...
612,m612,watchmen,2009,7.8,135229.0,['action' 'crime' 'fantasy' 'mystery' 'sci-fi'...
613,m613,xxx,2002,5.6,53505.0,['action' 'adventure' 'crime']
614,m614,x-men,2000,7.4,122149.0,['action' 'sci-fi']
615,m615,young frankenstein,1974,8.0,57618.0,['comedy' 'sci-fi']


## index_col parameter
used to change the by default index column by mentioned column

In [6]:
df2 = pd.read_csv('aug_train.csv',index_col = 'enrollee_id')
df2

Unnamed: 0_level_0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


## header parameter

![image.png](attachment:a5cb1993-cfad-47ea-973e-ed10b776efe8.png)

## use_cols parameter
to print some colomns which are mentioned

In [7]:
df3 = pd.read_csv('aug_train.csv' , usecols = ['city','gender','education_level'])
df3

Unnamed: 0,city,gender,education_level
0,city_103,Male,Graduate
1,city_40,Male,Graduate
2,city_21,,Graduate
3,city_115,,Graduate
4,city_162,Male,Masters
...,...,...,...
19153,city_173,Male,Graduate
19154,city_103,Male,Graduate
19155,city_103,Male,Graduate
19156,city_65,Male,High School


## skiprows/nrows parameter
skiprows = skip the rows mentioned e.g. skiprows[0,1]

nrows = if nrows(100) is present then only 100 rows will be printed


In [8]:
df2 = pd.read_csv('aug_train.csv')
df3 = pd.read_csv('aug_train.csv' , skiprows = [0,1])

df4 = pd.read_csv('aug_train.csv', nrows = 100)
df4

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,12081,city_65,0.802,Male,Has relevent experience,Full time course,Graduate,STEM,9,50-99,Pvt Ltd,1,33,0.0
96,7364,city_160,0.920,,No relevent experience,Full time course,High School,,2,100-500,Pvt Ltd,1,142,0.0
97,11184,city_74,0.579,,No relevent experience,Full time course,Graduate,STEM,2,100-500,Pvt Ltd,1,34,0.0
98,7016,city_65,0.802,Male,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Pvt Ltd,2,14,1.0


## encoding parameter
changes the encoding if it differs from UTF-8

## Setting Data Types with dtype
By default, pandas infers the datatype of each column, but sometimes this can be inefficient (e.g., storing numeric values as strings or objects).

In [15]:
df1 = pd.read_csv('aug_train.csv',dtype = {'target':'int32'}) 
df1

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0


## Parsing Dates with parse_dates
By default, pandas treats dates as object types. To use datetime operations, explicitly parse them:


In [32]:
df = pd.read_csv('IPL-2023 Data set.csv',parse_dates = ['Date'],dayfirst = True)
df.head()


Unnamed: 0,Match,Date,Teams,Team1,Team2,Ground,Venue,Toss,Toss_winner,Toss_decision,First_Innings,Second_Innings,MOM,Match_Conditions,Winner,Won_by,Result
0,1,31/03/2023,Gujarat Titans v Chennai Super Kings,Gujarat Titans,Chennai Super Kings,Ahmedabad,Narendra Modi Stadium,Gujarat Titans won the toss and opt to bowl,Gujarat Titans,Bowl,178,182,Rashid Khan,Night,Gujarat Titans,by 5 Wickets,Gujarat Titans won by 5 Wickets
1,2,01/04/2023,Kolkata Knight Riders v Punjab Kings,Kolkata Knight Riders,Punjab Kings,Mohali,Punjab Cricket Association IS Bindra Stadium,Kolkata Knight Riders won the toss and opt to ...,Kolkata Knight Riders,Bowl,191,146,Arshdeep Singh,Day/Night,Punjab Kings,by 7 Runs [D/L],Punjab Kings won by 7 Runs [D/L]
2,3,01/04/2023,Lucknow Super Giants v Delhi Capitals,Lucknow Super Giants,Delhi Capitals,Lucknow,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,Delhi Capitals won the toss and opt to bowl,Delhi Capitals,Bowl,193,143,Mark Wood,Night,Lucknow Super Giants,by 50 Runs,Lucknow Super Giants won by 50 Runs
3,4,02/04/2023,Rajasthan Royals v Sunrisers Hyderabad,Rajasthan Royals,Sunrisers Hyderabad,Hyderabad,Rajiv Gandhi International Stadium,Sunrisers Hyderabad won the toss and opt to bowl,Sunrisers Hyderabad,Bowl,203,131,Jos Buttler,Day/Night,Rajasthan Royals,by 72 Runs,Rajasthan Royals won by 72 Runs
4,5,02/04/2023,Mumbai Indians v Royal Challengers Bangalore,Mumbai Indians,Royal Challengers Bangalore,Bengaluru,M.Chinnaswamy Stadium,Royal Challengers Bangalore won the toss and o...,Royal Challengers Bangalore,Bowl,171,172,Faf du Plessis,Night,Royal Challengers Bangalore,by 8 Wickets,Royal Challengers Bangalore won by 8 Wickets


## Applying Custom Functions with converters
Use converters to apply a transformation on-the-fly while reading the CSV.


In [39]:
def team_name_change(name):
    if name == 'Gujarat Titans':
        return 'GT'
    elif name == 'Punjab Kings	':
        return 'pk'
    else:
        return name
        

In [43]:
df1 = pd.read_csv('IPL-2023 Data set.csv',converters={'team2':team_name_change})
df1.head()

Unnamed: 0,Match,Date,Teams,Team1,Team2,Ground,Venue,Toss,Toss_winner,Toss_decision,First_Innings,Second_Innings,MOM,Match_Conditions,Winner,Won_by,Result
0,1,31/03/2023,Gujarat Titans v Chennai Super Kings,Gujarat Titans,Chennai Super Kings,Ahmedabad,Narendra Modi Stadium,Gujarat Titans won the toss and opt to bowl,Gujarat Titans,Bowl,178,182,Rashid Khan,Night,Gujarat Titans,by 5 Wickets,Gujarat Titans won by 5 Wickets
1,2,01/04/2023,Kolkata Knight Riders v Punjab Kings,Kolkata Knight Riders,Punjab Kings,Mohali,Punjab Cricket Association IS Bindra Stadium,Kolkata Knight Riders won the toss and opt to ...,Kolkata Knight Riders,Bowl,191,146,Arshdeep Singh,Day/Night,Punjab Kings,by 7 Runs [D/L],Punjab Kings won by 7 Runs [D/L]
2,3,01/04/2023,Lucknow Super Giants v Delhi Capitals,Lucknow Super Giants,Delhi Capitals,Lucknow,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,Delhi Capitals won the toss and opt to bowl,Delhi Capitals,Bowl,193,143,Mark Wood,Night,Lucknow Super Giants,by 50 Runs,Lucknow Super Giants won by 50 Runs
3,4,02/04/2023,Rajasthan Royals v Sunrisers Hyderabad,Rajasthan Royals,Sunrisers Hyderabad,Hyderabad,Rajiv Gandhi International Stadium,Sunrisers Hyderabad won the toss and opt to bowl,Sunrisers Hyderabad,Bowl,203,131,Jos Buttler,Day/Night,Rajasthan Royals,by 72 Runs,Rajasthan Royals won by 72 Runs
4,5,02/04/2023,Mumbai Indians v Royal Challengers Bangalore,Mumbai Indians,Royal Challengers Bangalore,Bengaluru,M.Chinnaswamy Stadium,Royal Challengers Bangalore won the toss and o...,Royal Challengers Bangalore,Bowl,171,172,Faf du Plessis,Night,Royal Challengers Bangalore,by 8 Wickets,Royal Challengers Bangalore won by 8 Wickets
