# **Importing data**

## **Importing clean .csv files**

In [None]:
with open('titanic.csv') as f:
    text = f.readlines()

In [None]:
text

In [None]:
len(text)

In [None]:
import pandas as pd

In [None]:
pd.read_csv('titanic.csv')

In [None]:
pd.read_csv('titanic.csv', index_col = 'pclass')

In [None]:
pd.read_csv('titanic.csv', header = 0)

In [None]:
pd.read_csv('titanic.csv', header = None)

In [None]:
pd.read_csv('titanic.csv', header = 0, names = ['alive', 'class', 'gender', 'age', 'sibsp', 'parch', 'price', 'emb', 'deck'])

In [None]:
pd.read_csv('titanic.csv', header = 0, usecols = ['age', 'sex'])

In [None]:
df = pd.read_csv('titanic.csv', header = 0, index_col = 'pclass', usecols = ['pclass', 'age', 'sex'])

In [None]:
df.head()

In [None]:
df.columns = ['gender', 'age']

In [None]:
df.head()

In [None]:
df.index.name = 'class'

In [None]:
df.head()

## **Importing messy .csv files**

In [None]:
import pandas as pd

In [None]:
pd.read_csv('titanic_raw.csv')

In [None]:
pd.read_csv('titanic_raw.csv', skiprows = 3, skipfooter = 2)

In [None]:
col_names = ['Survived', 'Class', 'Gender', 'Age', 'SipSp', 'ParCh', 'Fare', 'Emb', 'Deck']

In [None]:
titanic = pd.read_csv('titanic_raw.csv', skiprows = 3, skipfooter = 2, header = None, names = col_names)

In [None]:
titanic.to_csv('titanic_imp.csv', index = False)

In [None]:
pd.read_csv('titanic_imp.csv')

## **Importing clean .xls(x) files**

In [None]:
import pandas as pd

In [None]:
sales = pd.read_excel('sales.xls')

In [None]:
sales.info()

In [None]:
sales.describe()

In [None]:
sales.head()

In [None]:
pd.read_excel('sales.xls', index_col = 0)

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0)

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0, names = ['Name', 'Loc_City', 'Loc_Country', 'Revenue', 'Add_Comp'])

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = 'A')

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = 'A:C')

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = 'A, C:E')

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = ':C')

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = [0, 3, 4])

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = [0])

In [None]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = ['Sales', 'Bonus'])

## **Importing messy .xls(x) files**

In [None]:
import pandas as pd

In [None]:
pd.read_excel('summer_raw.xls')

In [None]:
# pd.read_excel('summer_raw.xls', sheet_name = 'summer', header = 2, usecols = list(range(2, 10)), index_col = 0)

In [None]:
pd.read_excel('summer_raw.xls')

In [None]:
pd.read_excel('summer_raw.xls', sheet_name = 'summer') 
# Alternative
# pd.read_excel('summer_raw.xls', sheet_name = 1)

In [None]:
pd.read_excel('summer_raw.xls', sheet_name = 'summer', skiprows = 2) 

In [None]:
pd.read_excel('summer_raw.xls', sheet_name = 'summer', skiprows = 2, usecols = "D:L")
# Alternative
# pd.read_excel('summer_raw.xls', sheet_name = 'summer', skiprows = 2, usecols = "C:L", index_col = 0)

In [None]:
summer = pd.read_excel('summer_raw.xls', sheet_name = 'summer', skiprows = 2, usecols = "D:L")

In [None]:
summer.info()

In [None]:
summer.describe()

In [None]:
summer.head()

In [None]:
summer.to_csv('summer_imp.csv', index = False)

In [None]:
summer.to_excel('summer_imp.xlsx', index = False, engine = 'xlsxwriter')

## **Importing Data from the Web**

In [1]:
import pandas as pd

In [17]:
url_1 = 'https://en.wikipedia.org/wiki/1976_Summer_Olympics_medal_table'

In [18]:
sog_1976 = pd.read_html(url, index_col = 0)[3]

In [19]:
sog_1976.head()

Unnamed: 0_level_0,NOC,Gold,Silver,Bronze,Total
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Soviet Union,49,41,35,125
2,East Germany,40,25,25,90
3,United States,34,35,25,94
4,West Germany,10,12,17,39
5,Japan,9,6,10,25


In [20]:
# Index type, column dtypes and non-null counts of the 1976 table.
sog_1976.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42 entries, 1 to Totals (41 entries)
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   NOC     42 non-null     object
 1   Gold    42 non-null     int64 
 2   Silver  42 non-null     int64 
 3   Bronze  42 non-null     int64 
 4   Total   42 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 2.0+ KB


In [21]:
# Summary statistics of the numeric medal columns.
sog_1976.describe()

Unnamed: 0,Gold,Silver,Bronze,Total
count,42.0,42.0,42.0,42.0
mean,9.428571,9.47619,10.285714,29.190476
std,31.634454,31.209941,33.44328,96.038925
min,0.0,0.0,0.0,1.0
25%,0.0,1.0,1.0,2.0
50%,1.0,2.0,2.5,5.0
75%,4.0,5.0,5.75,13.0
max,198.0,199.0,216.0,613.0


In [34]:
# The table was loaded with index_col = 0, so the Rank values live in the
# index. Write the index out as well; index = None is falsy and silently
# dropped the Rank column from the exported file.
sog_1976.to_csv('sog_1976.csv', index = True)

In [23]:
url2 = 'https://en.wikipedia.org/wiki/1996_Summer_Olympics_medal_table'

In [30]:
sog_1996 = pd.read_html(url2)[3]

In [31]:
sog_1996.head()

Unnamed: 0,Rank,Nation,Gold,Silver,Bronze,Total
0,1,United States*,44,32,25,101
1,2,Russia,26,21,16,63
2,3,Germany,20,18,27,65
3,4,China,16,22,12,50
4,5,France,15,7,15,37


In [32]:
# Index type, column dtypes and non-null counts of the 1996 table.
sog_1996.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rank    80 non-null     object
 1   Nation  80 non-null     object
 2   Gold    80 non-null     int64 
 3   Silver  80 non-null     int64 
 4   Bronze  80 non-null     int64 
 5   Total   80 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 3.9+ KB


In [33]:
# Summary statistics of the numeric medal columns.
sog_1996.describe()

Unnamed: 0,Gold,Silver,Bronze,Total
count,80.0,80.0,80.0,80.0
mean,6.775,6.825,7.45,21.05
std,30.621402,30.648177,33.377359,94.461418
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,1.0,1.0
50%,1.0,1.0,1.0,4.0
75%,4.0,4.0,5.0,15.0
max,271.0,273.0,298.0,842.0


In [35]:
# Export without the generated RangeIndex; Rank is a regular column here and
# is therefore kept in the file.
sog_1996.to_csv('sog_1996.csv', index=False)