In [3]:
import pandas as pd
import numpy as np

# Create Dataframes

In [14]:
# Sample data: the integers 0..23 arranged as 6 rows by 4 columns.
myData = np.arange(24).reshape(6, 4)
print(myData)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]


In [15]:
# Wrap the array in a DataFrame with labelled rows (Row1..Row6)
# and labelled columns (Column1..Column4).
row_labels = [f"Row{i}" for i in range(1, 7)]
column_labels = [f"Column{j}" for j in range(1, 5)]
df = pd.DataFrame(myData, index=row_labels, columns=column_labels)
df

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19
Row6,20,21,22,23


In [16]:
# Preview the top of the frame; head() returns five rows when no count is given.
df.head()

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [17]:
# Preview the bottom of the frame; tail() returns the last five rows by default.
df.tail()

Unnamed: 0,Column1,Column2,Column3,Column4
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19
Row6,20,21,22,23


In [19]:
# Pass a count to head() to limit the preview to the first four rows.
df.head(4)

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15


In [21]:
# Confirm that the object is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

## Built-in DataFrame methods

In [22]:
# Print a summary of the index, each column's dtype and non-null count, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Row1 to Row6
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Column1  6 non-null      int64
 1   Column2  6 non-null      int64
 2   Column3  6 non-null      int64
 3   Column4  6 non-null      int64
dtypes: int64(4)
memory usage: 240.0+ bytes


In [24]:
# Descriptive statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,Column1,Column2,Column3,Column4
count,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0
std,7.483315,7.483315,7.483315,7.483315
min,0.0,1.0,2.0,3.0
25%,5.0,6.0,7.0,8.0
50%,10.0,11.0,12.0,13.0
75%,15.0,16.0,17.0,18.0
max,20.0,21.0,22.0,23.0


## Indexing
## By column name (`df[...]`), by row label (`.loc`), or by row/column position (`.iloc`)


## Indexing by column name

In [30]:
# Show the first five rows again as a reference for the selections that follow.
df.head()

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [29]:
# Selecting a single column by name returns a Series indexed by the row labels.
df["Column1"]


Row1     0
Row2     4
Row3     8
Row4    12
Row5    16
Row6    20
Name: Column1, dtype: int64

## Selecting more than one column or more than one row returns a DataFrame.
## Selecting a single column or a single row returns a Series.

In [26]:
# Passing a list of column names returns a DataFrame holding just those columns.
df[["Column1", "Column2", "Column3"]]

Unnamed: 0,Column1,Column2,Column3
Row1,0,1,2
Row2,4,5,6
Row3,8,9,10
Row4,12,13,14
Row5,16,17,18
Row6,20,21,22


## Row indexing by label using the loc indexer

In [34]:
# A single row label with .loc returns that row as a Series keyed by column name.
df.loc["Row3"]

Column1     8
Column2     9
Column3    10
Column4    11
Name: Row3, dtype: int64

In [36]:
# A list of row labels with .loc returns a DataFrame containing those rows.
df.loc[["Row2", "Row3"]]

Unnamed: 0,Column1,Column2,Column3,Column4
Row2,4,5,6,7
Row3,8,9,10,11


## Indexing by row and column position using the iloc indexer

In [37]:
# Show the first five rows as a reference for the positional slices below.
df.head()

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [38]:
# Positional slice: rows at positions 2-3 and the first two columns
# (the end of each slice is exclusive).
df.iloc[2:4, :2]

Unnamed: 0,Column1,Column2
Row3,8,9
Row4,12,13


In [40]:
# Rows at positions 1-2, and every column from position 1 onward.
df.iloc[1:3,1:]

Unnamed: 0,Column2,Column3,Column4
Row2,5,6,7
Row3,9,10,11


In [41]:
# Columns that are not adjacent can be picked by listing their names.
df[["Column1","Column4"]]

Unnamed: 0,Column1,Column4
Row1,0,3
Row2,4,7
Row3,8,11
Row4,12,15
Row5,16,19
Row6,20,23


## Converting DataFrames to arrays

In [46]:
# Convert the selected rows and columns to a NumPy array. to_numpy() is the
# method the pandas documentation recommends over the older .values attribute.
df.iloc[0:4, 0:2].to_numpy()

array([[ 0,  1],
       [ 4,  5],
       [ 8,  9],
       [12, 13]])

# Operations

## basic operations

In [55]:
# Small 3x3 table with two missing (NaN) entries.
data = np.array([
    [1, np.nan, 3],
    [4, 5, np.nan],
    [6, 7, 8],
])

row_names = [f"Row{n}" for n in (1, 2, 3)]
column_names = [f"Column{n}" for n in (1, 2, 3)]
df = pd.DataFrame(data, index=row_names, columns=column_names)

In [56]:
# Display the frame; the missing cells hold NaN.
df

Unnamed: 0,Column1,Column2,Column3
Row1,1.0,,3.0
Row2,4.0,5.0,
Row3,6.0,7.0,8.0


In [57]:
# Count the missing values in each column.
df.isnull().sum()

Column1    0
Column2    1
Column3    1
dtype: int64

In [62]:
# True for every column that contains no missing values.
df.notnull().all()

Column1     True
Column2    False
Column3    False
dtype: bool

In [63]:
# Frequency of each distinct value in Column3; NaN entries are left out by default.
df["Column3"].value_counts()

Column3
3.0    1
8.0    1
Name: count, dtype: int64

In [66]:
# Keep the rows whose Column2 value exceeds 2. A NaN compares as False,
# so the row with a missing Column2 is excluded.
df.loc[df["Column2"] > 2]

Unnamed: 0,Column1,Column2,Column3
Row2,4.0,5.0,
Row3,6.0,7.0,8.0


# Reading CSV file using pandas and StringIO

In [69]:
from io import StringIO
import pandas as pd

In [86]:
# Load the goalscorers dataset from the local Dataset folder.
df = pd.read_csv("Dataset/goalscorers.csv")

In [87]:
# Preview the first ten rows of the loaded data.
df.head(10)

Unnamed: 0,date,home_team,away_team,team,scorer,minute,own_goal,penalty
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False
4,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,False
5,1916-07-06,Argentina,Chile,Chile,Telésforo Báez,44.0,False,False
6,1916-07-06,Argentina,Chile,Argentina,Juan Domingo Brown,60.0,False,True
7,1916-07-06,Argentina,Chile,Argentina,Juan Domingo Brown,62.0,False,True
8,1916-07-06,Argentina,Chile,Argentina,Alberto Marcovecchio,67.0,False,False
9,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,75.0,False,False


## Reading CSV-formatted string data into a DataFrame

In [88]:
# CSV text held in memory; StringIO lets read_csv consume it like a file.
myData = ('age,name,city\n'
          '23,Abhishek Sharma, Kolkata\n'
          '24,Sachin Tendulkar, Mumbai\n'
         )
# skipinitialspace=True drops the blank that follows each comma, so the city
# values parse as "Kolkata"/"Mumbai" rather than " Kolkata"/" Mumbai".
people_df = pd.read_csv(StringIO(myData), skipinitialspace=True)
people_df

Unnamed: 0,age,name,city
0,23,Abhishek Sharma,Kolkata
1,24,Sachin Tendulkar,Mumbai


In [98]:
# Read only the columns of interest, then keep the first 15 rows.
selected_columns = ["home_team", "away_team", "minute", "penalty"]
goals_subset = pd.read_csv("Dataset/goalscorers.csv", usecols=selected_columns)
mydf = goals_subset.head(15)
mydf

Unnamed: 0,home_team,away_team,minute,penalty
0,Chile,Uruguay,44.0,False
1,Chile,Uruguay,55.0,False
2,Chile,Uruguay,70.0,False
3,Chile,Uruguay,75.0,False
4,Argentina,Chile,2.0,False
5,Argentina,Chile,44.0,False
6,Argentina,Chile,60.0,True
7,Argentina,Chile,62.0,True
8,Argentina,Chile,67.0,False
9,Argentina,Chile,75.0,False


## Converting data from a DataFrame to CSV

In [99]:
# Write the frame to test.csv; index=False leaves the row index out of the file.
mydf.to_csv("test.csv", index=False)

In [100]:
# Read the file back in to check the round trip.
newdf = pd.read_csv("test.csv")

In [101]:
# Display the frame loaded from test.csv.
newdf

Unnamed: 0,home_team,away_team,minute,penalty
0,Chile,Uruguay,44.0,False
1,Chile,Uruguay,55.0,False
2,Chile,Uruguay,70.0,False
3,Chile,Uruguay,75.0,False
4,Argentina,Chile,2.0,False
5,Argentina,Chile,44.0,False
6,Argentina,Chile,60.0,True
7,Argentina,Chile,62.0,True
8,Argentina,Chile,67.0,False
9,Argentina,Chile,75.0,False


In [107]:
# Recode the penalty flag as an integer: 1 where the value equals True, otherwise 0.
df["penalty"] = df["penalty"].eq(True).astype(int)

In [108]:
# Preview the first fifteen rows to inspect the penalty column.
df.head(15)

Unnamed: 0,date,home_team,away_team,team,scorer,minute,own_goal,penalty
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,0
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,0
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,0
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,0
4,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,0
5,1916-07-06,Argentina,Chile,Chile,Telésforo Báez,44.0,False,0
6,1916-07-06,Argentina,Chile,Argentina,Juan Domingo Brown,60.0,False,1
7,1916-07-06,Argentina,Chile,Argentina,Juan Domingo Brown,62.0,False,1
8,1916-07-06,Argentina,Chile,Argentina,Alberto Marcovecchio,67.0,False,0
9,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,75.0,False,0


In [121]:
# Load only the two team columns; index_col=False tells pandas not to use
# the first column of the file as the index.
pd.read_csv("test.csv", usecols=["home_team","away_team"], index_col = False)

Unnamed: 0,home_team,away_team
0,Chile,Uruguay
1,Chile,Uruguay
2,Chile,Uruguay
3,Chile,Uruguay
4,Argentina,Chile
5,Argentina,Chile
6,Argentina,Chile
7,Argentina,Chile
8,Argentina,Chile
9,Argentina,Chile
