# Pandas Library

In [1]:
# Optional one-time setup, left commented out: installs pandas.
# In a notebook, `%pip install pandas` targets the running kernel's environment.
#!pip install pandas

In [1]:
import os

import numpy as np
import pandas as pd

## Creating Series
### from lists

In [6]:
# Plain Python list of four integers; later cells wrap it in a Series.
mydata = [25, 43, 35, 44]
mydata

[25, 43, 35, 44]

In [8]:
# Confirm that `mydata` is a built-in list before converting it to a Series.
type(mydata)

list

In [10]:
# No index is passed, so the Series gets the default integer labels 0..3.
pd.Series(mydata)

0    25
1    43
2    35
3    44
dtype: int64

In [20]:
# Custom labels, one per element of `mydata`, used as the Series index below.
myindex = ["elma", "armut", "erik", "visne"]

In [22]:
# Spell out which argument is the values and which is the labels.
pd.Series(data=mydata, index=myindex)

elma     25
armut    43
erik     35
visne    44
dtype: int64

### from arrays

In [24]:
# Seed NumPy's global generator before drawing so the same five values come
# back on every Restart & Run All (the DataFrame section seeds the same way).
np.random.seed(42)
# Five integers drawn from [0, 100); the upper bound is exclusive.
array = np.random.randint(0, 100, 5)
array

array([ 5, 80, 82,  2, 93])

In [26]:
# One label per random value; used as the index of `myserie`.
names = ["Ahmet", "Mehmet", "Ayse", "Can", "Mert"]

In [32]:
# Pair each value of the NumPy array with the name at the same position.
myserie = pd.Series(data=array, index=names)
myserie

Ahmet      5
Mehmet    80
Ayse      82
Can        2
Mert      93
dtype: int32

In [34]:
# Build a Series from an ndarray with `copy` left at its default. The next
# cells write into `ser` and then look at `arr` to see whether it changed too.
arr = np.array([1, 2])
ser = pd.Series(data=arr)
ser

0    1
1    2
dtype: int32

In [38]:
# Overwrite the first element by position, then show the modified Series.
ser.iloc[0] = 90
ser

0    90
1     2
dtype: int32

In [40]:
# Inspect the source array after the write to `ser`; in the recorded run it
# changed as well, i.e. the Series was built on the array's data without a copy.
arr

array([90,  2])

In [44]:
# Same experiment again, this time asking pandas for its own copy of the data.
arr = np.array([1, 2])
ser = pd.Series(data=arr, copy=True)
ser

0    1
1    2
dtype: int32

In [46]:
# Overwrite the first element of the copied Series, then show it.
ser.iloc[0] = 90
ser

0    90
1     2
dtype: int32

In [48]:
# With copy=True the source array keeps its original values after the write.
arr

array([1, 2])

### from dictionaries

In [52]:
# Name -> age mapping; the keys become index labels when wrapped in a Series.
ages = {
    "Ayse": 15,
    "Can": 12,
}
ages

{'Ayse': 15, 'Can': 12}

In [54]:
# A dict becomes a Series: keys are the index labels, values are the data.
pd.Series(ages)

Ayse    15
Can     12
dtype: int64

In [58]:
# An explicit index picks entries by label: "Ayse" is found in the dict, "b" is
# not and shows up as NaN, which also turns the dtype into float64.
pd.Series(ages, index=["Ayse", "b"])

Ayse    15.0
b        NaN
dtype: float64

## Series Operations

In [60]:
# Two country -> value mappings. Turkey, Germany and Italy occur in both;
# France is only in `x` and Spain only in `y`.
x = {
    "France": 80,
    "Turkey": 120,
    "Germany": 200,
    "Italy": 230,
}
y = {
    "Spain": 90,
    "Turkey": 135,
    "Germany": 245,
    "Italy": 210,
}

In [62]:
# Wrap both dicts in Series so they can be combined label by label.
x_ser, y_ser = pd.Series(data=x), pd.Series(data=y)

In [64]:
# Show both Series from a single cell (a bare last expression shows only one).
display(x_ser,y_ser)

France      80
Turkey     120
Germany    200
Italy      230
dtype: int64

Spain       90
Turkey     135
Germany    245
Italy      210
dtype: int64

In [66]:
# Label-based lookup of a single value.
x_ser["Turkey"]

120

In [68]:
# Same label looked up in the second Series.
y_ser["Turkey"]

135

In [70]:
# keys() returns the Series' index labels as an Index object.
x_ser.keys()

Index(['France', 'Turkey', 'Germany', 'Italy'], dtype='object')

In [72]:
# Scalar multiplication is applied to every element; the labels are unchanged.
x_ser * 2

France     160
Turkey     240
Germany    400
Italy      460
dtype: int64

In [74]:
# Division by a scalar is element-wise too; the result dtype is float64.
x_ser / 10

France      8.0
Turkey     12.0
Germany    20.0
Italy      23.0
dtype: float64

In [77]:
# Addition aligns the operands on index labels. Labels present in only one
# Series (France, Spain) have no partner value and come out as NaN.
x_ser + y_ser

France       NaN
Germany    445.0
Italy      440.0
Spain        NaN
Turkey     255.0
dtype: float64

In [81]:
# fill_value=0 stands in for a label missing on one side, so France and Spain
# keep their own value instead of becoming NaN.
x_ser.add(y_ser, fill_value=0)

France      80.0
Germany    445.0
Italy      440.0
Spain       90.0
Turkey     255.0
dtype: float64

## Creating a DataFrame

In [83]:
# Fix the global seed so the same grid of numbers is produced on every run.
np.random.seed(101)
# Integers in [0, 101), i.e. 0 through 100, arranged as 4 rows by 3 columns.
mydata = np.random.randint(low=0, high=101, size=(4, 3))
mydata

array([[95, 11, 81],
       [70, 63, 87],
       [75,  9, 77],
       [40,  4, 63]])

In [87]:
# Without index/columns arguments both axes get default integer labels.
pd.DataFrame(mydata)

Unnamed: 0,0,1,2
0,95,11,81
1,70,63,87
2,75,9,77
3,40,4,63


In [89]:
# Row labels (four, one per row of `mydata`) ...
myindex = list("ABCD")
# ... and column labels (three, one per column of `mydata`).
mycolumns = list("XYZ")

In [91]:
# Name each argument so it is clear which list labels rows and which columns.
pd.DataFrame(data=mydata, index=myindex, columns=mycolumns)

Unnamed: 0,X,Y,Z
A,95,11,81
B,70,63,87
C,75,9,77
D,40,4,63


In [97]:
# Column-oriented input: each key is a column name, each list holds its values.
dic = {
    "col1": [1, 2],
    "col2": [3, 4],
}
# Label the two rows "a" and "b" rather than the default 0 and 1.
df = pd.DataFrame(data=dic, index=["a", "b"])
df

Unnamed: 0,col1,col2
a,1,3
b,2,4


In [99]:
# Recall the dict of scalar values defined in the Series section.
ages

{'Ayse': 15, 'Can': 12}

In [101]:
# Every value in `ages` is a scalar, so the explicit index supplies the rows;
# each row repeats the same scalar per column. Rebinds `df`.
df = pd.DataFrame(data=ages, index=["a", "b"])
df

Unnamed: 0,Ayse,Can
a,15,12
b,15,12


In [105]:
# Prints a newline character; the cell only produces blank output.
print("\n")





## Reading a CSV File into a DataFrame

In [None]:
# Templates for writing a Windows path in pd.read_csv (all left commented out):
# 1) absolute path, every backslash escaped as "\\"
# df = pd.read_csv("C:\\Users\\myself\\files\\some_file.csv")
# 2) relative path starting one directory up, backslashes escaped
# df = pd.read_csv("..\\files\\some_file.csv")
# 3) absolute path as a raw string (r"..."), so backslashes need no escaping
# df = pd.read_csv(r"C:\Users\myself\files\some_file.csv")

In [107]:
# Location of the tips dataset. If the TIPS_CSV environment variable is set it
# takes precedence, so the notebook can run on machines where the absolute,
# user-specific Windows path below does not exist. Otherwise that original
# path is used unchanged.
tips_csv_path = os.environ.get(
    "TIPS_CSV",
    r"C:\Users\oznur\OneDrive\Masaüstü\DataScience-20242\Datasets\tips.csv",
)
# Rebinds `df` (until now one of the small demo frames) to the loaded dataset.
df = pd.read_csv(tips_csv_path)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17


In [109]:
# Column labels of the loaded frame.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size',
       'price_per_person', 'Payer Name', 'CC Number', 'Payment ID'],
      dtype='object')

In [111]:
# Row labels; read_csv was given no index column, so this is a RangeIndex.
df.index

RangeIndex(start=0, stop=244, step=1)

In [125]:
df.head(1)  # first row only; pass a larger n, e.g. df.head(60), to see more

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959


In [129]:
# Last row of the frame.
df.tail(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
243,18.78,3.0,Female,No,Thur,Dinner,2,9.39,Michelle Hardin,3511451626698139,Thur672


In [131]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   Payer Name        244 non-null    object 
 9   CC Number         244 non-null    int64  
 10  Payment ID        244 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 21.1+ KB


In [133]:
# (number of rows, number of columns)
df.shape

(244, 11)

In [135]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the object columns are left out by default.
df.describe()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
count,244.0,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,7.888197,2563496000000000.0
std,8.902412,1.383638,0.9511,2.914234,2369340000000000.0
min,3.07,1.0,1.0,2.88,60406790000.0
25%,13.3475,2.0,2.0,5.8,30407310000000.0
50%,17.795,2.9,2.0,7.255,3525318000000000.0
75%,24.1275,3.5625,3.0,9.39,4553675000000000.0
max,50.81,10.0,6.0,20.27,6596454000000000.0


In [139]:
# Transposed summary: one row per numeric column, one column per statistic.
# `df.describe().T` is the shorthand spelling of the same call.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.78594,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9510998,1.0,2.0,2.0,3.0,6.0
price_per_person,244.0,7.888197,2.914234,2.88,5.8,7.255,9.39,20.27
CC Number,244.0,2563496000000000.0,2369340000000000.0,60406790000.0,30407310000000.0,3525318000000000.0,4553675000000000.0,6596454000000000.0


In [141]:
# Summary restricted to the object (string) columns: count, number of unique
# values, most frequent value and its frequency.
df.describe(include="object")

Unnamed: 0,sex,smoker,day,time,Payer Name,Payment ID
count,244,244,244,244,244,244
unique,2,2,4,2,244,243
top,Male,No,Sat,Dinner,Christy Cunningham,Thur8084
freq,157,151,87,176,1,2


In [143]:
# len() of a DataFrame is its number of rows.
len(df)

244

### Selection and Indexing

In [145]:
# Quick reminder of the available columns before selecting from them.
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608


In [147]:
# Single brackets with one column label return that column as a Series.
df["day"]

0       Sun
1       Sun
2       Sun
3       Sun
4       Sun
       ... 
239     Sat
240     Sat
241     Sat
242     Sat
243    Thur
Name: day, Length: 244, dtype: object

In [157]:
# Double brackets pass a list of labels; the result is displayed as a table
# with a single column rather than as a Series.
df[["day"]]

Unnamed: 0,day
0,Sun
1,Sun
2,Sun
3,Sun
4,Sun
...,...
239,Sat
240,Sat
241,Sat
242,Sat


In [149]:
# Confirms that single-label selection yields a pandas Series.
type(df["day"])

pandas.core.series.Series

In [159]:
# Select several columns at once by passing a list of labels.
df[["total_bill","tip", "smoker"]]

Unnamed: 0,total_bill,tip,smoker
0,16.99,1.01,No
1,10.34,1.66,No
2,21.01,3.50,No
3,23.68,3.31,No
4,24.59,3.61,No
...,...,...,...
239,29.03,5.92,No
240,27.18,2.00,Yes
241,22.67,2.00,Yes
242,17.82,1.75,No


In [161]:
# Confirms that selecting with a list of labels yields a DataFrame.
type(df[["total_bill","tip", "smoker"]])

pandas.core.frame.DataFrame

### Creating a New Column

### Removing or Deleting a Column

### Index Basics

#### set_index()

#### reset_index()

#### loc & iloc

### Removing a Row

### Inserting a Row

## Conditional Filtering

### isin()