In [2]:
import pandas as pd

* CSV 파일의 특정 열을 지정하여 데이터프레임 생성

In [5]:
# Read only the 'Film' and 'Year' columns from the CSV (usecols restricts
# which columns are parsed), then preview the first five rows.
movies = pd.read_csv(
    "data/movies.csv",
    usecols=['Film', 'Year'],
)
movies.head(5)

Unnamed: 0,Film,Year
0,Zack and Miri Make a Porno,2008
1,Youth in Revolt,2010
2,You Will Meet a Tall Dark Stranger,2010
3,When in Rome,2010
4,What Happens in Vegas,2008


In [6]:
# (number of rows, number of columns) of the frame loaded with usecols.
movies.shape

(77, 2)

In [7]:
# Load every column of the same CSV so the full data can be compared with
# the two-column `movies` frame; preview the first five rows.
movies_all = pd.read_csv(filepath_or_buffer="data/movies.csv")
movies_all.head(5)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


# 브로드캐스팅을 활용하여 특정 열 생성 및 초기화

In [9]:
# Create the 'has_seen' column and initialize every row to 0 by broadcasting
# the scalar across the whole column.
# NOTE(review): despite its name, this column is later overwritten with the
# floor-averaged 'Audience score %' and 'Rotten Tomatoes %' values.
movies['has_seen'] = 0
movies.head()

Unnamed: 0,Film,Year,has_seen
0,Zack and Miri Make a Porno,2008,0
1,Youth in Revolt,2010,0
2,You Will Meet a Tall Dark Stranger,2010,0
3,When in Rome,2010,0
4,What Happens in Vegas,2008,0


In [60]:
# Overwrite 'has_seen' with the floor-divided average of the two rating
# columns: add the scores, then integer-divide by 2. The resulting Series is
# aligned to `movies` by index label on assignment.
movies['has_seen'] = (
    movies_all['Audience score %']
    .add(movies_all['Rotten Tomatoes %'])
    .floordiv(2)
)
movies.head(5)

Unnamed: 0,Film,Year,매출(원화),has_seen
0,Zack and Miri Make a Porno,2008,58044.96,67
1,Youth in Revolt,2010,27154.08,60
2,You Will Meet a Tall Dark Stranger,2010,36897.44,39
3,When in Rome,2010,59567.36,29
4,What Happens in Vegas,2008,303608.08,50


# 타입 변경

In [30]:
# Print each column's non-null count and dtype to see which columns were
# parsed as numbers and which remained object (string).
movies_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Film               77 non-null     object 
 1   Genre              77 non-null     object 
 2   Lead Studio        77 non-null     object 
 3   Audience score %   77 non-null     int64  
 4   Profitability      77 non-null     float64
 5   Rotten Tomatoes %  77 non-null     int64  
 6   Worldwide Gross    77 non-null     object 
 7   Year               77 non-null     int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 4.9+ KB


In [15]:
# Show a single row to inspect how the raw values are formatted.
movies_all.head(1)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008


* CSV를 읽을 때 숫자로 변환 가능한 값은 판다스가 자동으로 숫자 타입으로 변환해 준다.  
* CSV 파일은 텍스트 파일이기 때문에 `$` 같은 서식이 들어간 숫자 데이터는 문자열(object)로 인식한다. 예) Worldwide Gross  
* 서식이 있는 숫자 데이터는 서식을 제거하고 별도로 형변환을 해주어야 한다. => 이러한 작업을 전처리(Preprocessing)라고 한다.
* 열의 타입이 문자열인 경우, 열.str 속성의 문자열 함수를 사용할 수 있다.  

# 타입변경을 위한 기본 전처리

* 전처리(Preprocessing): 데이터 분석을 하기 위해서 원본 데이터를 분석 가능한 데이터로 조작하는 작업

In [17]:
# Inspect the raw 'Worldwide Gross' column: the values carry a '$' prefix and
# the column dtype is object (string), not a number.
movies_all['Worldwide Gross']

0      $41.94 
1      $19.62 
2      $26.66 
3      $43.04 
4     $219.37 
        ...   
72     $29.37 
73     $30.68 
74      $8.97 
75    $160.31 
76     $60.72 
Name: Worldwide Gross, Length: 77, dtype: object

In [19]:
# Strip the literal '$' from every value. '$' is a regex metacharacter
# (end-of-string anchor) and the default of `regex` differs between pandas
# versions, so regex=False is passed explicitly to force a plain-text match.
# The result is still a string (object) Series; no type conversion happens here.
rg_removed_dallar = movies_all['Worldwide Gross'].str.replace('$', '', regex=False)
rg_removed_dallar

0      41.94 
1      19.62 
2      26.66 
3      43.04 
4     219.37 
       ...   
72     29.37 
73     30.68 
74      8.97 
75    160.31 
76     60.72 
Name: Worldwide Gross, Length: 77, dtype: object

* astype: 형변환 함수

In [20]:
# astype returns a new Series converted to float64; the result is only
# displayed here, so rg_removed_dallar itself keeps its object dtype.
rg_removed_dallar.astype('float64')

0      41.94
1      19.62
2      26.66
3      43.04
4     219.37
       ...  
72     29.37
73     30.68
74      8.97
75    160.31
76     60.72
Name: Worldwide Gross, Length: 77, dtype: float64

* 메소드 파이프라인  
  앞의 메소드의 수행 결과를 연속적으로 사용하는 프로그래밍 방식

In [22]:
# Method chain: remove the literal '$' and convert to float64 in one expression.
# regex=False makes '$' a plain character rather than a regex anchor, so the
# result does not depend on the pandas version's default for `regex`.
movies_all['Worldwide Gross'].str.replace('$', '', regex=False).astype('float64')

0      41.94
1      19.62
2      26.66
3      43.04
4     219.37
       ...  
72     29.37
73     30.68
74      8.97
75    160.31
76     60.72
Name: Worldwide Gross, Length: 77, dtype: float64

# 전처리한 결과를 특정 위치에 삽입

* insert(): 데이터프레임.insert(삽입할 열의 인덱스, 열 이름, 값) => 새 데이터프레임을 반환하지 않고 원본 데이터프레임을 직접 변경한다. 같은 이름의 열이 이미 있으면 ValueError가 발생한다.

* 열의 인덱스 확인

In [33]:
# Integer position of the 'has_seen' column within movies.columns; insert()
# takes such a position as its first argument.
movies.columns.get_loc('has_seen')

2

In [26]:
# Show one row to check the current column layout before inserting.
movies.head(1)

Unnamed: 0,Film,Year,has_seen
0,Zack and Miri Make a Porno,2008,67


In [61]:
# Numeric version of 'Worldwide Gross': strip the literal '$' and convert the
# remaining text to float64. regex=False forces a plain-text match, because
# '$' is a regex metacharacter and the `regex` default varies across pandas
# versions.
converted_gross = (
    movies_all['Worldwide Gross']
    .str.replace('$', '', regex=False)
    .astype('float64')
)
converted_gross

0      41.94
1      19.62
2      26.66
3      43.04
4     219.37
       ...  
72     29.37
73     30.68
74      8.97
75    160.31
76     60.72
Name: Worldwide Gross, Length: 77, dtype: float64

In [102]:
# Insert the '매출(원화)' column immediately after 'has_seen'.
# DataFrame.insert mutates `movies` in place and raises ValueError when the
# column already exists, so the call is guarded to keep this cell re-runnable.
# NOTE(review): 1384 is presumably the KRW-per-USD exchange rate — confirm.
if '매출(원화)' not in movies.columns:
    movies.insert(movies.columns.get_loc('has_seen') + 1, '매출(원화)', converted_gross * 1384)
movies.head()

Unnamed: 0,Film,Year,has_seen,매출(원화),매출(미화)
0,Zack and Miri Make a Porno,2008,67,58044.96,41.94
1,Youth in Revolt,2010,60,27154.08,19.62
2,You Will Meet a Tall Dark Stranger,2010,39,36897.44,26.66
3,When in Rome,2010,29,59567.36,43.04
4,What Happens in Vegas,2008,50,303608.08,219.37


* 열 이름으로 열 삭제

In [None]:
# 데이터프레임.drop(columns=[삭제할 열 이름 리스트])                # 열이 삭제된 새 데이터프레임을 반환, 원본은 변경되지 않음
# 데이터프레임.drop(columns=[삭제할 열 이름 리스트], inplace=True)  # 원본 데이터프레임 자체를 변경 (반환값은 None)

In [103]:
# Without inplace=True, drop returns a new frame that lacks the column;
# `movies` itself is left unchanged.
movies.drop(columns = ['매출(원화)'])

Unnamed: 0,Film,Year,has_seen,매출(미화)
0,Zack and Miri Make a Porno,2008,67,41.94
1,Youth in Revolt,2010,60,19.62
2,You Will Meet a Tall Dark Stranger,2010,39,26.66
3,When in Rome,2010,29,43.04
4,What Happens in Vegas,2008,50,219.37
...,...,...,...,...
72,Across the Universe,2007,69,29.37
73,A Serious Man,2009,76,30.68
74,A Dangerous Method,2011,84,8.97
75,27 Dresses,2008,55,160.31


In [104]:
# Confirm that the preceding non-inplace drop did not modify `movies`.
movies.head()

Unnamed: 0,Film,Year,has_seen,매출(원화),매출(미화)
0,Zack and Miri Make a Porno,2008,67,58044.96,41.94
1,Youth in Revolt,2010,60,27154.08,19.62
2,You Will Meet a Tall Dark Stranger,2010,39,36897.44,26.66
3,When in Rome,2010,29,59567.36,43.04
4,What Happens in Vegas,2008,50,303608.08,219.37


In [110]:
# Drop the '매출(미화)' column in place. Once the column is gone, running the
# cell again makes drop raise KeyError; that case is reported with a message
# instead of a traceback. Only KeyError is caught so that any unrelated
# failure still surfaces.
try:
    movies.drop(columns=['매출(미화)'], inplace=True)
except KeyError:
    print('해당 열이 존재하지 않아 삭제할 수 없습니다.')
movies.head()

Unnamed: 0,Film,Year,has_seen
0,Zack and Miri Make a Porno,2008,67
1,Youth in Revolt,2010,60
2,You Will Meet a Tall Dark Stranger,2010,39
3,When in Rome,2010,29
4,What Happens in Vegas,2008,50


In [106]:
# Place the '매출(원화)' column right after 'Film'.
# DataFrame.insert raises ValueError if the column already exists. That is the
# case when the notebook runs top to bottom, because the earlier drop of
# '매출(원화)' was not in place. Remove any existing copy first so the column is
# effectively moved to the new position.
# NOTE(review): 1384 is presumably the KRW-per-USD exchange rate — confirm.
if '매출(원화)' in movies.columns:
    movies.drop(columns=['매출(원화)'], inplace=True)
movies.insert(movies.columns.get_loc('Film') + 1, '매출(원화)', converted_gross * 1384)
movies.head()

Unnamed: 0,Film,매출(원화),Year,has_seen,매출(미화)
0,Zack and Miri Make a Porno,58044.96,2008,67,41.94
1,Youth in Revolt,27154.08,2010,60,19.62
2,You Will Meet a Tall Dark Stranger,36897.44,2010,39,26.66
3,When in Rome,59567.36,2010,29,43.04
4,What Happens in Vegas,303608.08,2008,50,219.37


In [111]:
# Display the frame (pandas truncates long frames in the rendered output).
movies

Unnamed: 0,Film,Year,has_seen
0,Zack and Miri Make a Porno,2008,67
1,Youth in Revolt,2010,60
2,You Will Meet a Tall Dark Stranger,2010,39
3,When in Rome,2010,29
4,What Happens in Vegas,2008,50
...,...,...,...
72,Across the Universe,2007,69
73,A Serious Man,2009,76
74,A Dangerous Method,2011,84
75,27 Dresses,2008,55


* 문제 풀어봐요  
'매출(미화)' 열을 movies 데이터 프레임 제일 마지막 열에 추가하세요.

In [112]:
# Exercise answer: add '매출(미화)' as the last column of `movies`.
# len(movies.columns) is the position one past the current last column, so the
# new column lands at the end no matter where 'has_seen' currently sits.
# Guarded because DataFrame.insert raises ValueError for an existing column,
# which would otherwise break a re-run of this cell.
if '매출(미화)' not in movies.columns:
    movies.insert(len(movies.columns), '매출(미화)', converted_gross)
movies.head()

Unnamed: 0,Film,Year,has_seen,매출(미화)
0,Zack and Miri Make a Porno,2008,67,41.94
1,Youth in Revolt,2010,60,19.62
2,You Will Meet a Tall Dark Stranger,2010,39,26.66
3,When in Rome,2010,29,43.04
4,What Happens in Vegas,2008,50,219.37


In [113]:
# Remove the '매출(미화)' column in place. If the column is already absent,
# drop raises KeyError and a message is printed instead. The handler is
# limited to KeyError so other errors are not silently swallowed.
try:
    movies.drop(columns=['매출(미화)'], inplace=True)
except KeyError:
    print('해당 열이 존재하지 않아 삭제할 수 없습니다.')
movies.head()

Unnamed: 0,Film,Year,has_seen
0,Zack and Miri Make a Porno,2008,67
1,Youth in Revolt,2010,60
2,You Will Meet a Tall Dark Stranger,2010,39
3,When in Rome,2010,29
4,What Happens in Vegas,2008,50


# 특정 원소 변경

### 조회

In [47]:
# Read a single value: row label 74, column label 'has_seen'.
movies.loc[74,'has_seen']

84

### 변경

In [48]:
# Overwrite that single value in place via .loc, then read it back to confirm.
movies.loc[74,'has_seen'] = 98
movies.loc[74,'has_seen']

98

# 데이터 삭제

### 데이터 프레임에서 삭제

In [50]:
# Reload the full CSV into a separate frame used only for the deletion
# examples, and preview its first five rows.
movies_del = pd.read_csv(filepath_or_buffer='data/movies.csv')
movies_del.head(5)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


### 열삭제

In [51]:
# Column drop without inplace: a new frame lacking 'Rotten Tomatoes %' is
# returned and previewed; movies_del itself is not modified.
movies_del.drop(columns=['Rotten Tomatoes %']).head(5)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,$219.37,2008


In [53]:
# movies_del still contains 'Rotten Tomatoes %' because the drop was not in place.
movies_del.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


In [54]:
# Same non-inplace column drop again: it still succeeds because the previous
# call returned a copy and left the column in movies_del.
movies_del.drop(columns=['Rotten Tomatoes %']).head(5)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,$219.37,2008


In [55]:
# In-place column drop: movies_del itself loses 'Rotten Tomatoes %'.
movies_del.drop(columns=['Rotten Tomatoes %'], inplace=True)
movies_del

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.090000,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.000000,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,$219.37,2008
...,...,...,...,...,...,...,...
72,Across the Universe,romance,Independent,84,0.652603,$29.37,2007
73,A Serious Man,Drama,Universal,64,4.382857,$30.68,2009
74,A Dangerous Method,Drama,Independent,89,0.448645,$8.97,2011
75,27 Dresses,Comedy,Fox,71,5.343622,$160.31,2008


### 행삭제

In [57]:
# Drop several rows at once by index label (2, 3 and 4). The result is a new
# frame; movies_del is not modified.
movies_del.drop(index=[2, 3, 4])

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.090000,$19.62,2010
5,Water For Elephants,Drama,20th Century Fox,72,3.081421,$117.09,2011
6,WALL-E,Animation,Disney,89,2.896019,$521.28,2008
7,Waitress,Romance,Independent,67,11.089742,$22.18,2007
...,...,...,...,...,...,...,...
72,Across the Universe,romance,Independent,84,0.652603,$29.37,2007
73,A Serious Man,Drama,Universal,64,4.382857,$30.68,2009
74,A Dangerous Method,Drama,Independent,89,0.448645,$8.97,2011
75,27 Dresses,Comedy,Fox,71,5.343622,$160.31,2008


* 슬라이스 적용

In [59]:
# Drop the rows at positions 10 through 19 (the slice end, 20, is exclusive).
# The labels come from movies_del's own index, so the cell does not rely on a
# different frame happening to share the same index. index= already selects
# the row axis, so the redundant axis=0 is omitted.
movies_del.drop(index=movies_del.index[10:20]).head(20)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,$219.37,2008
5,Water For Elephants,Drama,20th Century Fox,72,3.081421,$117.09,2011
6,WALL-E,Animation,Disney,89,2.896019,$521.28,2008
7,Waitress,Romance,Independent,67,11.089742,$22.18,2007
8,Waiting For Forever,Romance,Independent,53,0.005,$0.03,2011
9,Valentine's Day,Comedy,Warner Bros.,54,4.184038,$217.57,2010
