# Zajęcia 22: Wprowadzenie do Pandas

## Instalacja i importowanie Pandas

In [2]:
!pip install pandas



In [4]:
import pandas as pd

# Series - podstawowy typ danych w Pandas
Series to jednowymiarowa struktura danych, przypominająca listę lub słownik.

In [5]:
# Tworzenie Series z domyślnym indeksem
data = [10, 20, 30, 40]
series = pd.Series(data)
print(series)

# Tworzenie Series z własnym indeksem
index = ['a', 'b', 'c', 'd']
series_custom = pd.Series(data, index=index)
print(series_custom)

0    10
1    20
2    30
3    40
dtype: int64
a    10
b    20
c    30
d    40
dtype: int64


### Ćwiczenie: 
Stwórz własny obiekt Series z pięcioma wartościami (możesz użyć `range`) i niestandardowym indeksem.

In [9]:
x = pd.Series(range(5), index = ['q','w','e','t','y'])
print(x)


q    0
w    1
e    2
t    3
y    4
dtype: int64


## DataFrame - dwuwymiarowa tabela podobna do Excela 

In [12]:
data = {
    'Imię': ['Jan', 'Anna', 'Tomasz'],
    'Wiek': [25, 30, 35],
    'Miasto': ['Warszawa', 'Kraków', 'Gdańsk']
}

df = pd.DataFrame(data) # Łatwo tworzone ze słowników

# Różnica między print a display
print(df)
display(df)

# Wywoływanie indeksów i kolumn
print(df.columns)
print(df.index)

     Imię  Wiek    Miasto
0     Jan    25  Warszawa
1    Anna    30    Kraków
2  Tomasz    35    Gdańsk


Unnamed: 0,Imię,Wiek,Miasto
0,Jan,25,Warszawa
1,Anna,30,Kraków
2,Tomasz,35,Gdańsk


Index(['Imię', 'Wiek', 'Miasto'], dtype='object')
RangeIndex(start=0, stop=3, step=1)


In [13]:
data_bez_imie = {
    'Wiek': [25, 30, 35],
    'Miasto': ['Warszawa', 'Kraków', 'Gdańsk']
}

# Używanie indeksu innego niż domyślny
df = pd.DataFrame(data_bez_imie, index = ['Jan', 'Anna', 'Tomasz'])
display(df)
print(df.index)

Unnamed: 0,Wiek,Miasto
Jan,25,Warszawa
Anna,30,Kraków
Tomasz,35,Gdańsk


Index(['Jan', 'Anna', 'Tomasz'], dtype='object')


In [14]:
# Alternatywne ustawienie indeksu innego niż domyślny
df = pd.DataFrame(data).set_index('Imię')
display(df)
print(df.index)

# Powrót indeksu do kolumny
df = df.reset_index(drop = False)
display(df)

Unnamed: 0_level_0,Wiek,Miasto
Imię,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,25,Warszawa
Anna,30,Kraków
Tomasz,35,Gdańsk


Index(['Jan', 'Anna', 'Tomasz'], dtype='object', name='Imię')


Unnamed: 0,Imię,Wiek,Miasto
0,Jan,25,Warszawa
1,Anna,30,Kraków
2,Tomasz,35,Gdańsk


## Importowanie danych

In [15]:
# Importowanie danych z pliku CSV

# Jeżeli nie działa, podaj swoją lokalizację pliku
df_titanic = pd.read_csv('../input/titanic/train.csv') 

df_titanic.info()

display(df_titanic.describe()) # dla kolumn liczbowych

# Alternatywne wyjścia dla ustawienia indeksu
df_titanic = df_titanic.set_index('PassengerId')
df_titanic = pd.read_csv('../input/titanic/train.csv', index_col= "PassengerId") 

display(df_titanic.head())  # Podgląd pierwszych 5 wierszy
display(df_titanic.tail(10))  # Podgląd ostatnich 10 wierszy

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S
883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [16]:
# Importowanie danych z pliku Excel
df_tdf = pd.read_excel('../input/tour_de_france/tour_de_france.xlsx', sheet_name='Stats', index_col='Year')
display(df_tdf.head())

Unnamed: 0_level_0,Winner's avg speed,Total distance (km),Number of stages,Finishers,Entrants,Winner,Winner's Nationality,Winner's Team,Start Date,End Date,Starting city,Starting city Latitude,Starting city Longitude,Starting country,Finishing city,Finishing city Latitude,Finishing city Longitude
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1903,25.68,2428.0,6,21,60,Maurice Garin,France,La Française,1903-07-01,1903-07-21,Paris,48.8566,2.3522,France,Paris,48.8566,2.3522
1904,25.27,2420.0,6,27,88,Henri Cornet,France,Cycles JC,1904-07-02,1904-07-24,Paris,48.8566,2.3522,France,Paris,48.8566,2.3522
1905,27.11,2994.0,11,24,60,Louis Trousselier,France,Cycles Peugeot,1905-07-09,1905-07-30,Paris,48.8566,2.3522,France,Paris,48.8566,2.3522
1906,24.46,4545.0,13,14,82,René Pottier,France,Cycles Peugeot,1906-07-04,1906-07-29,Paris,48.8566,2.3522,France,Paris,48.8566,2.3522
1907,28.47,4488.0,14,33,93,Lucien Petit-Breton,France,Cycles Peugeot,1907-07-08,1907-08-04,Paris,48.8566,2.3522,France,Paris,48.8566,2.3522


### Ćwiczenie:
1. Zaczytaj plik z lokacji `../input/titanic/test.csv` 
2. ustaw index `PassengerId`
3. Wyświetl pierwszych 10 wierszy

In [27]:
#df_titanic_test = pd.read_csv('../input/titanic/test.csv')


#df_titanic_test = df_titanic_test.set_index('PassengerId')

## Exportowanie danych

In [36]:
# Zapis to csv
df_titanic.to_csv('../output/titanic_kopia.csv', index=False)

# Różnica w uwzględnianiu indeksu
df_titanic.to_csv('../output/titanic_kopia2.csv', index=True)

# Zapis do xlsx
df_tdf.to_excel('../output/tour_de_france_kopia.xlsx', index=True, sheet_name="Nazwa")

OSError: Cannot save file into a non-existent directory: '..\output'

## Selekcja danych

In [37]:
# Pobieranie jednej kolumny
slice_series = df_titanic['Name']
display(slice_series)
print(type(slice_series))

# Pobieranie jednej kolumny jako DataFrame
slice_df = df_titanic[['Name']]
display(slice_df)  
print(type(slice_df))


PassengerId
1                                Braund, Mr. Owen Harris
2      Cumings, Mrs. John Bradley (Florence Briggs Th...
3                                 Heikkinen, Miss. Laina
4           Futrelle, Mrs. Jacques Heath (Lily May Peel)
5                               Allen, Mr. William Henry
                             ...                        
887                                Montvila, Rev. Juozas
888                         Graham, Miss. Margaret Edith
889             Johnston, Miss. Catherine Helen "Carrie"
890                                Behr, Mr. Karl Howell
891                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

<class 'pandas.core.series.Series'>


Unnamed: 0_level_0,Name
PassengerId,Unnamed: 1_level_1
1,"Braund, Mr. Owen Harris"
2,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
3,"Heikkinen, Miss. Laina"
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
5,"Allen, Mr. William Henry"
...,...
887,"Montvila, Rev. Juozas"
888,"Graham, Miss. Margaret Edith"
889,"Johnston, Miss. Catherine Helen ""Carrie"""
890,"Behr, Mr. Karl Howell"


<class 'pandas.core.frame.DataFrame'>


In [38]:
# Pobranie kilku kolumn
slice_2 = df_titanic[['Name', 'Age']]
display(slice_2)  
print(type(slice_2))

Unnamed: 0_level_0,Name,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",22.0
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
3,"Heikkinen, Miss. Laina",26.0
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
5,"Allen, Mr. William Henry",35.0
...,...,...
887,"Montvila, Rev. Juozas",27.0
888,"Graham, Miss. Margaret Edith",19.0
889,"Johnston, Miss. Catherine Helen ""Carrie""",
890,"Behr, Mr. Karl Howell",26.0


<class 'pandas.core.frame.DataFrame'>


In [39]:
# Selekcja wierszy
display(df_titanic[:5])  # Pierwsze 5 wierszy
display(df_titanic[-5:])  # Ostatnie 5 wierszy
display(df_titanic[100:105])  # Wiersze od 100 do 104

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
101,0,3,"Petranec, Miss. Matilda",female,28.0,0,0,349245,7.8958,,S
102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
103,0,1,"White, Mr. Richard Frasar",male,21.0,0,1,35281,77.2875,D26,S
104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,3101276,7.925,,S


## Selekcja danych `.loc` i `.iloc`

In [40]:
# Selekcja danych .loc (wybór według etykiet)
# Istotne są etykiety!!!
# display(df_titanic.loc[0, 'Name'])  # Nie działa!!!

display(df_titanic.loc[1, 'Name'])  # Pobranie wartości z pierwszego wiersza, kolumna 'Name'
display(df_titanic.loc[1:5, ['Name', 'Age']])  # Pobranie 6 pierwszych wierszy dla kolumn 'Name' i 'Age'
display(df_titanic.loc[:, ['Name', 'Age']])  # Pobranie wszystkich wierszy dla kolumn 'Name' i 'Age'

display(df_titanic.loc[10:20, :])  # Wiersze od 10 do 20, wszystkie kolumny
display(df_titanic.loc[::10, :])  # Co 10. wiersz

'Braund, Mr. Owen Harris'

Unnamed: 0_level_0,Name,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",22.0
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
3,"Heikkinen, Miss. Laina",26.0
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
5,"Allen, Mr. William Henry",35.0


Unnamed: 0_level_0,Name,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",22.0
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
3,"Heikkinen, Miss. Laina",26.0
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
5,"Allen, Mr. William Henry",35.0
...,...,...
887,"Montvila, Rev. Juozas",27.0
888,"Graham, Miss. Margaret Edith",19.0
889,"Johnston, Miss. Catherine Helen ""Carrie""",
890,"Behr, Mr. Karl Howell",26.0


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.05,,S
14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,,S
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S
17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q
18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0000,,S
31,0,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,,C
41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40.0,1,0,7546,9.4750,,S
...,...,...,...,...,...,...,...,...,...,...,...
851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.0,4,2,347082,31.2750,,S
861,0,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
871,0,3,"Balkic, Mr. Cerin",male,26.0,0,0,349248,7.8958,,S
881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S


In [41]:
# Selekcja danych .iloc (wybór według indeksów)
display(df_titanic.iloc[0, 1])  # Pobranie wartości z pierwszego wiersza, drugiej kolumny
display(df_titanic.iloc[:3, :2])  # Pierwsze 3 wiersze i 2 pierwsze kolumny
display(df_titanic.iloc[5:15, 1:-1])  # Wiersze od 5 do 14, kolumny od 1 do przedostatniej
display(df_titanic.iloc[::5, :])  # Co 5. wiersz, wszystkie kolumny

3

Unnamed: 0_level_0,Survived,Pclass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,3
2,1,1
3,1,3


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,
7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46
8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,
9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,
10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,
11,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6
12,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103
13,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.05,
14,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,
15,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0000,,S
21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...
871,0,3,"Balkic, Mr. Cerin",male,26.0,0,0,349248,7.8958,,S
876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C
881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q


### Ćwiczenie: 
Wybierz imię i wiek dziesiątej osoby z df_titanic za pomocą loc i iloc.

In [48]:
display(df_titanic.loc[10, ['Name', 'Age']]) 
display(df_titanic.iloc[9, [2,4]])

Name    Nasser, Mrs. Nicholas (Adele Achem)
Age                                    14.0
Name: 10, dtype: object

Name    Nasser, Mrs. Nicholas (Adele Achem)
Age                                    14.0
Name: 10, dtype: object

## Łączenie DataFrame'ów

In [50]:
# Merge - łączenie dwóch DataFrame'ów na podstawie wspólnej kolumny
employees = pd.DataFrame({'ID': [1, 2, 3, 4], 'Name': ['Jan', 'Anna', 'Tomasz', 'Jakub']})
display(employees)
salaries = pd.DataFrame({'ID': [1, 2, 3, 5], 'Salary': [4000, 5000, 6000, 7000]})
display(salaries)
merged_df = employees.merge(salaries, on='ID')
display(merged_df)


Unnamed: 0,ID,Name
0,1,Jan
1,2,Anna
2,3,Tomasz
3,4,Jakub


Unnamed: 0,ID,Salary
0,1,4000
1,2,5000
2,3,6000
3,5,7000


Unnamed: 0,ID,Name,Salary
0,1,Jan,4000
1,2,Anna,5000
2,3,Tomasz,6000


In [51]:
# Różnica między left, right, inner, outer
def merge_display(df1, df2, on, how):
    display(df1.merge(df2, on = on, how = how))

merge_display(employees, salaries, on = 'ID', how = 'left')
merge_display(employees, salaries, on = 'ID', how = 'right')
merge_display(employees, salaries, on = 'ID', how = 'inner')
merge_display(employees, salaries, on = 'ID', how = 'outer')

Unnamed: 0,ID,Name,Salary
0,1,Jan,4000.0
1,2,Anna,5000.0
2,3,Tomasz,6000.0
3,4,Jakub,


Unnamed: 0,ID,Name,Salary
0,1,Jan,4000
1,2,Anna,5000
2,3,Tomasz,6000
3,5,,7000


Unnamed: 0,ID,Name,Salary
0,1,Jan,4000
1,2,Anna,5000
2,3,Tomasz,6000


Unnamed: 0,ID,Name,Salary
0,1,Jan,4000.0
1,2,Anna,5000.0
2,3,Tomasz,6000.0
3,4,Jakub,
4,5,,7000.0


In [52]:
# Join - łączenie DataFrame'ów na podstawie indeksu
left = pd.DataFrame({'A': [1, 2, 3]}, index=['a', 'b', 'c'])
display(left)
right = pd.DataFrame({'B': [4, 5, 6]}, index=['a', 'b', 'c'])
display(right)

joined_df = left.join(right, how='inner')
display(joined_df)

Unnamed: 0,A
a,1
b,2
c,3


Unnamed: 0,B
a,4
b,5
c,6


Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [53]:
# Concat - łączenie DataFrame'ów wierszami
concat_df = pd.concat([left, right], axis=1)
display(concat_df)

# Concat - łączenie DataFrame'ów kolumnami
df_titanic_all = pd.concat([df_titanic, df_titanic_test], axis=0)
display(df_titanic_all)

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PassengerId
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,
...,...,...,...,...,...,...,...,...,...,...,...,...
413,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,1305.0
414,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1306.0
415,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1307.0
416,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,1308.0


### Ćwiczenie: 
Połącz dwa DataFrame'y zawierające różne informacje o pasażerach Titanica.

In [59]:
x = df_titanic[['Survived', 'Name']]
y = df_titanic[['Age', 'Ticket']]

display(x.join(y))

Unnamed: 0_level_0,Survived,Name,Age,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,"Braund, Mr. Owen Harris",22.0,A/5 21171
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599
3,1,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803
5,0,"Allen, Mr. William Henry",35.0,373450
...,...,...,...,...
887,0,"Montvila, Rev. Juozas",27.0,211536
888,1,"Graham, Miss. Margaret Edith",19.0,112053
889,0,"Johnston, Miss. Catherine Helen ""Carrie""",,W./C. 6607
890,1,"Behr, Mr. Karl Howell",26.0,111369
