# **DataFrame Basics II**

### **Filtering DataFrames by one Condition**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
titanic.info()

In [None]:
titanic.dtypes

In [None]:
titanic.age.head()

In [None]:
titanic.sex == "male"

In [None]:
# Boolean-mask selection with plain brackets: keep the rows whose sex equals "male".
males1 = titanic[titanic["sex"].eq("male")]

In [None]:
males1.head()

In [None]:
# Build the boolean mask once so that later cells can reuse it.
male_filter = titanic["sex"].eq("male")
# Label-based selection with the mask: same rows as the plain-bracket version.
males2 = titanic.loc[male_filter, :]

In [None]:
males2.head()

In [None]:
# Row mask and column list in one .loc call: male rows, only the age and fare columns.
males3 = titanic.loc[
    male_filter,
    ["age", "fare"],
]

In [None]:
males3.head()

In [None]:
# Mask for the rows whose sex equals "female" ...
female_filter = titanic["sex"].eq("female")
# ... applied with plain bracket indexing.
females = titanic[female_filter]

In [None]:
females

In [None]:
males2.dtypes

In [None]:
# Boolean Series indexed by column name: True where the column's dtype is `object`.
object_filter = males2.dtypes == object
# Keep every row, but only the columns whose dtype is not `object`.
males2_numbers = males2.loc[:, ~object_filter]

In [None]:
males2_numbers

In [None]:
males = titanic.loc[male_filter, ~object_filter]

In [None]:
males.head()

### **Filtering DataFrames by many Conditions (AND)**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
# Mask for male passengers, then the matching rows and a short preview.
males_filter = titanic["sex"].eq("male")
males = titanic.loc[males_filter, :]
males.head()

In [None]:
# Mask for passengers strictly older than 14, then the matching rows and a short preview.
age_filter = titanic["age"].gt(14)
grown_ups = titanic.loc[age_filter, :]
grown_ups.head()

In [None]:
grown_males = titanic.loc[males_filter & age_filter]
grown_males.head()

In [None]:
grown_males.describe()

### **Filtering DataFrames by many Conditions (OR)**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')
titanic.head()

In [None]:
# Mask for female passengers; preview the first few True/False values.
females_filter = titanic["sex"].eq("female")
females_filter.head()

In [None]:
# Mask for passengers strictly younger than 14; preview the first few True/False values.
child_filter = titanic["age"].lt(14)
child_filter.head()

In [None]:
woman_or_child = titanic.loc[females_filter | child_filter]

In [None]:
woman_or_child.head()

In [None]:
woman_or_child.describe()

In [None]:
# Quick look at the Olympics data set; the frame is not stored here, so show only
# the first rows instead of rendering the whole file.
pd.read_csv('summer.csv').head()

## **Advanced Filtering with between(), isin() and ~**

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv('summer.csv')

In [None]:
summer.head()

In [None]:
summer.info()

In [None]:
# All rows whose Year equals 1988.
games_1988 = summer.loc[summer["Year"].eq(1988)]

In [None]:
games_1988.head()

In [None]:
games_1988.info()

In [None]:
# All rows from 1992 onwards (1992 itself included).
games_since_1992 = summer.loc[summer["Year"].ge(1992)]

In [None]:
games_since_1992

In [None]:
games_since_1992.head()

In [None]:
games_since_1992.tail()

In [None]:
# between() with inclusive='both' keeps both end points, i.e. 1990 <= Year <= 1999.
games_90s = summer.loc[
    summer["Year"].between(1990, 1999, inclusive='both')
]

In [None]:
games_90s

In [None]:
# isin() tests membership in an explicit list of years.
selected_games = summer.loc[summer["Year"].isin([1976, 1988]), :]

In [None]:
selected_games

In [None]:
# Two masks on the already-filtered games, combined with AND:
# weightlifting rows whose country code is BUL.
discipline = selected_games["Discipline"].eq('Weightlifting')
country = selected_games["Country"].eq('BUL')
selected_games.loc[country & discipline]

## **any()** and **all()**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
(titanic.sex == 'male').any()

In [None]:
(titanic.sex == 'male').all()

In [None]:
(titanic.age == 80.0).any()

In [None]:
titanic.loc[titanic.age == 80.0]

In [None]:
# Small hand-made Series mixing positive, negative, zero and fractional values,
# used below to try out any() and all().
pd_series = pd.Series(
    data=[1, 5, -5, 0, -1, 0.5],
)

In [None]:
pd_series.any()

In [None]:
pd_series.all()

In [None]:
fares = titanic.fare.unique()

In [None]:
# Order the unique fares. The return value is not used, so this relies on sort()
# reordering `fares` in place before it is wrapped in a Series in the next cell.
fares.sort()

In [None]:
fares = pd.Series(fares)

In [None]:
fares.head()

In [None]:
titanic.fare.all()

## **Removing columns**

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv('summer.csv')

In [None]:
summer.head()

In [None]:
# The result is only displayed, not assigned back: `summer` itself keeps its
# 'Sport' column (it is removed for real further down with `del`).
summer.drop(columns = 'Sport')

In [None]:
summer.head()

In [None]:
summer.drop(columns = ['Sport', 'Discipline'])

In [None]:
summer.head()

In [None]:
# Same operation spelled with labels/axis: axis = 1 addresses columns.
# As above, the result is displayed but not stored.
summer.drop(labels = 'Event', axis = 1)
# Equivalent spelling: summer.drop(labels = 'Event', axis = "columns")

In [None]:
summer.head()

In [None]:
del summer['Sport']

In [None]:
summer.head()

In [None]:
summer = pd.read_csv('summer.csv')

In [None]:
summer.head()

In [None]:
# Instead of dropping columns, list the ones to keep (all rows, selected columns).
summer.loc[
    :,
    [
        'Year',
        'City',
        'Athlete',
        'Country',
        'Gender',
        'Event',
        'Medal',
    ],
]

## **Removing rows**

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv('summer.csv',index_col = 'Athlete')

In [None]:
summer.head(10)

In [None]:
summer.drop(index = 'HAJOS, Alfred').head(10)

In [None]:
summer.drop(index = ['HAJOS, Alfred', 'DRIVAS, Dimitrios']).head(10)

In [None]:
summer.drop(labels = "HERSCHMANN, Otto", axis = 0).head(10)

In [None]:
summer.loc[summer.Year == 1996]

In [None]:
# Remove rows by keeping their complement: a row survives only if it is
# neither from 1996 nor an Aquatics row.
filter1 = summer["Year"].eq(1996)
filter2 = summer["Sport"].eq('Aquatics')
summer.loc[~filter1 & ~filter2]

## **Adding new Columns to a DataFrame**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
# 'Zeros' has not been created yet, so this lookup fails with a KeyError.
# The error is the point of the cell: catch and show it so that
# "Restart & Run All" continues past this demonstration.
try:
    titanic['Zeros']
except KeyError as err:
    print(repr(err))

In [None]:
titanic['Zeros'] = 0

In [None]:
titanic.head()

In [None]:
titanic['Zeros_2'] = '0'

In [None]:
titanic.head()

In [None]:
titanic.dtypes

In [None]:
# There is no 'Ones' column (or attribute) yet, so attribute access fails with an
# AttributeError. Catch and show it so that "Restart & Run All" keeps going.
try:
    titanic.Ones
except AttributeError as err:
    print(repr(err))

In [None]:
# Pitfall demo: attribute-style assignment does NOT add a 'Ones' column (pandas warns
# about this); it only sets a plain attribute on the object.
# Use titanic['Ones'] = 1 to create a real column.
titanic.Ones = 1

In [None]:
titanic.head()

In [None]:
titanic.Ones

## **Creating Columns based on other Columns**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
1912 - titanic.age

In [None]:
# Year of birth derived from age: rsub(1912) computes 1912 - age element-wise.
titanic['YoB'] = titanic['age'].rsub(1912)

In [None]:
titanic.head()

In [None]:
# Element-wise sum of the two family-count columns.
titanic['relatives'] = titanic['sibsp'].add(titanic['parch'])

In [None]:
titanic.head()

In [None]:
# The two source columns are now covered by 'relatives'; drop them.
# Reassign the result instead of using inplace = True, which returns None
# and cannot be chained.
titanic = titanic.drop(columns = ['sibsp', 'parch'])

In [None]:
titanic.head()

In [None]:
inflation_factor = 10

In [None]:
# Fare scaled by the inflation factor defined in the previous cell.
titanic['ia_fare'] = titanic['fare'].mul(inflation_factor)

In [None]:
titanic.head()

## **Adding Columns with insert()**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
relatives = titanic['sibsp'] + titanic['parch']

In [None]:
relatives.head()

In [None]:
# insert() places the new column at a chosen position (here: integer position 6)
# and modifies `titanic` directly.
titanic.insert(loc = 6, column = 'relatives', value = relatives)

In [None]:
titanic.head()

## **Creating DataFrames from Scratch with pd.DataFrame()**

In [4]:
import pandas as pd

In [5]:
# Player names; used as the 'Player' column and later as index labels.
player = [
    "Lionel Messi",
    "Cristiano Ronaldo",
    "Neymar Junior",
    "Kylian Mbappe",
    "Manuel Neuer",
]

In [6]:
nationality = ["Argentina", "Portugal", "Brasil", "France", "Germany"]

In [7]:
club = ["FC Barcelona", "Juventus FC", "Paris SG", "Paris SG", "FC Bayern" ]

In [8]:
world_champion = [False, False, False, True, True]

In [9]:
height = [1.70, 1.87, 1.75, 1.78, 1.93]

In [10]:
goals = [45, 44, 28, 21, 0]

In [11]:
# Column label -> list of values; each list holds one entry per player.
dic = dict(
    Player=player,
    Nationality=nationality,
    Club=club,
    World_Champion=world_champion,
    Height=height,
    Goals_2018=goals,
)

In [12]:
dic

{'Player': ['Lionel Messi',
  'Cristiano Ronaldo',
  'Neymar Junior',
  'Kylian Mbappe',
  'Manuel Neuer'],
 'Nationality': ['Argentina', 'Portugal', 'Brasil', 'France', 'Germany'],
 'Club': ['FC Barcelona', 'Juventus FC', 'Paris SG', 'Paris SG', 'FC Bayern'],
 'World_Champion': [False, False, False, True, True],
 'Height': [1.7, 1.87, 1.75, 1.78, 1.93],
 'Goals_2018': [45, 44, 28, 21, 0]}

In [13]:
df = pd.DataFrame(data = dic)

In [14]:
df

Unnamed: 0,Player,Nationality,Club,World_Champion,Height,Goals_2018
0,Lionel Messi,Argentina,FC Barcelona,False,1.7,45
1,Cristiano Ronaldo,Portugal,Juventus FC,False,1.87,44
2,Neymar Junior,Brasil,Paris SG,False,1.75,28
3,Kylian Mbappe,France,Paris SG,True,1.78,21
4,Manuel Neuer,Germany,FC Bayern,True,1.93,0


In [15]:
players = df.set_index('Player')

In [16]:
players

Unnamed: 0_level_0,Nationality,Club,World_Champion,Height,Goals_2018
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lionel Messi,Argentina,FC Barcelona,False,1.7,45
Cristiano Ronaldo,Portugal,Juventus FC,False,1.87,44
Neymar Junior,Brasil,Paris SG,False,1.75,28
Kylian Mbappe,France,Paris SG,True,1.78,21
Manuel Neuer,Germany,FC Bayern,True,1.93,0


In [17]:
# Turn the column-wise lists into row-wise tuples, one tuple per player.
raw_data = [*zip(nationality, club, world_champion, height, goals)]

In [18]:
raw_data

[('Argentina', 'FC Barcelona', False, 1.7, 45),
 ('Portugal', 'Juventus FC', False, 1.87, 44),
 ('Brasil', 'Paris SG', False, 1.75, 28),
 ('France', 'Paris SG', True, 1.78, 21),
 ('Germany', 'FC Bayern', True, 1.93, 0)]

In [19]:
messi, ronaldo, neymar, mbappe, neuer = raw_data

In [20]:
messi

('Argentina', 'FC Barcelona', False, 1.7, 45)

In [21]:
headers = ["Nationality", "Club", "World_Champion", "Height", "Goals_2018"]

In [22]:
# Row-wise construction: one tuple per player, with explicit column labels
# and the player names as index.
df_1 = pd.DataFrame(
    raw_data,
    columns = headers,
    index = player,
)

In [23]:
df_1

Unnamed: 0,Nationality,Club,World_Champion,Height,Goals_2018
Lionel Messi,Argentina,FC Barcelona,False,1.7,45
Cristiano Ronaldo,Portugal,Juventus FC,False,1.87,44
Neymar Junior,Brasil,Paris SG,False,1.75,28
Kylian Mbappe,France,Paris SG,True,1.78,21
Manuel Neuer,Germany,FC Bayern,True,1.93,0


In [24]:
# Same frame as df_1, built from the individually unpacked row tuples.
df_2 = pd.DataFrame(
    [messi, ronaldo, neymar, mbappe, neuer],
    columns = headers,
    index = player,
)

In [25]:
df_2

Unnamed: 0,Nationality,Club,World_Champion,Height,Goals_2018
Lionel Messi,Argentina,FC Barcelona,False,1.7,45
Cristiano Ronaldo,Portugal,Juventus FC,False,1.87,44
Neymar Junior,Brasil,Paris SG,False,1.75,28
Kylian Mbappe,France,Paris SG,True,1.78,21
Manuel Neuer,Germany,FC Bayern,True,1.93,0


In [26]:
# One-column frame: nationalities labelled by player name.
df_3 = pd.DataFrame({'Nationality': nationality}, index = player)

In [27]:
df_3

Unnamed: 0,Nationality
Lionel Messi,Argentina
Cristiano Ronaldo,Portugal
Neymar Junior,Brasil
Kylian Mbappe,France
Manuel Neuer,Germany


In [28]:
df_3['Club'] = club

In [29]:
df_3

Unnamed: 0,Nationality,Club
Lionel Messi,Argentina,FC Barcelona
Cristiano Ronaldo,Portugal,Juventus FC
Neymar Junior,Brasil,Paris SG
Kylian Mbappe,France,Paris SG
Manuel Neuer,Germany,FC Bayern


## **Adding new Rows (hands-on approach)**

In [30]:
# Move 'Player' from the index back into a regular column so that new rows can be
# addressed by integer label. Reassign instead of using inplace = True, which
# returns None and cannot be chained.
players = players.reset_index()

In [31]:
players

Unnamed: 0,Player,Nationality,Club,World_Champion,Height,Goals_2018
0,Lionel Messi,Argentina,FC Barcelona,False,1.7,45
1,Cristiano Ronaldo,Portugal,Juventus FC,False,1.87,44
2,Neymar Junior,Brasil,Paris SG,False,1.75,28
3,Kylian Mbappe,France,Paris SG,True,1.78,21
4,Manuel Neuer,Germany,FC Bayern,True,1.93,0


In [32]:
# Append one row by assigning to a label that does not exist yet (5 follows 0..4).
# World_Champion gets a real boolean: the string 'False' is truthy and would not
# match the bool values already stored in that column.
players.loc[5, :] = ['Berbatov', 'Bulgaria', 'Man Utd', False, 1.88, 0]

In [33]:
players

Unnamed: 0,Player,Nationality,Club,World_Champion,Height,Goals_2018
0,Lionel Messi,Argentina,FC Barcelona,False,1.7,45.0
1,Cristiano Ronaldo,Portugal,Juventus FC,False,1.87,44.0
2,Neymar Junior,Brasil,Paris SG,False,1.75,28.0
3,Kylian Mbappe,France,Paris SG,True,1.78,21.0
4,Manuel Neuer,Germany,FC Bayern,True,1.93,0.0
5,Berbatov,Bulgaria,Man Utd,False,1.88,0.0


In [34]:
# Rows to append, using the same column labels as `players` so they line up on concat.
# World_Champion holds real booleans (not the string 'False'), and the club name has
# no leading blank so that it compares equal to the existing 'FC Barcelona' entry.
new = pd.DataFrame(data = [['Stoichkov', 'Bulgaria', 'FC Barcelona', False, 1.80, 0],
                           ['van Basten', 'Netherlands', 'FC Milan', False, 1.90, 0]],
                   columns = players.columns)

In [35]:
new

Unnamed: 0,Player,Nationality,Club,World_Champion,Height,Goals_2018
0,Stoichkov,Bulgaria,FC Barcelona,False,1.8,0
1,van Basten,Netherlands,FC Milan,False,1.9,0


In [42]:
mixed_df = pd.concat([players, new], ignore_index = True)

In [43]:
mixed_df

Unnamed: 0,Player,Nationality,Club,World_Champion,Height,Goals_2018
0,Lionel Messi,Argentina,FC Barcelona,False,1.7,45.0
1,Cristiano Ronaldo,Portugal,Juventus FC,False,1.87,44.0
2,Neymar Junior,Brasil,Paris SG,False,1.75,28.0
3,Kylian Mbappe,France,Paris SG,True,1.78,21.0
4,Manuel Neuer,Germany,FC Bayern,True,1.93,0.0
5,Berbatov,Bulgaria,Man Utd,False,1.88,0.0
6,Stoichkov,Bulgaria,FC Barcelona,False,1.8,0.0
7,van Basten,Netherlands,FC Milan,False,1.9,0.0


In [38]:
# Use the player names as index again. Reassign instead of using inplace = True,
# which returns None and cannot be chained.
mixed_df = mixed_df.set_index('Player')

In [39]:
mixed_df

Unnamed: 0_level_0,Nationality,Club,World_Champion,Height,Goals_2018
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lionel Messi,Argentina,FC Barcelona,False,1.7,45.0
Cristiano Ronaldo,Portugal,Juventus FC,False,1.87,44.0
Neymar Junior,Brasil,Paris SG,False,1.75,28.0
Kylian Mbappe,France,Paris SG,True,1.78,21.0
Manuel Neuer,Germany,FC Bayern,True,1.93,0.0
Berbatov,Bulgaria,Man Utd,False,1.88,0.0
Stoichkov,Bulgaria,FC Barcelona,False,1.8,0.0
van Basten,Netherlands,FC Milan,False,1.9,0.0
