# Pandas 基礎介紹

## 匯入資料
* https://raw.githubusercontent.com/Code-Gym/python-dataset/master/u.user.txt
* read_csv
* 分隔符號(sep)
* 索引欄位(index_col)

https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

In [1]:
import pandas as pd

# Load the pipe-delimited user file directly from GitHub. No index_col is given,
# so pandas adds its own default integer row labels and user_id stays a column.
users = pd.read_csv('https://raw.githubusercontent.com/Code-Gym/python-dataset/master/u.user.txt',
                    sep='|') # sep: the field delimiter used in the file
users

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [2]:
import pandas as pd

# Same load as before, but with index_col=False passed explicitly;
# user_id remains an ordinary column in the displayed result.
users = pd.read_csv('https://raw.githubusercontent.com/Code-Gym/python-dataset/master/u.user.txt',
                    sep='|',
                    index_col=False) # index_col=False can be used to force pandas to not use the first column as the index
users

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [3]:
import pandas as pd

# Approach 1: read from the remote URL and use the user_id column as the row index
users = pd.read_csv('https://raw.githubusercontent.com/Code-Gym/python-dataset/master/u.user.txt',
                   sep='|',
                   index_col='user_id')
users

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [4]:
import pandas as pd

# Approach 2: read a local copy of the file, located relative to the working directory
users = pd.read_csv('./u.user.txt', sep='|', index_col='user_id') # relative path
users

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [5]:
import pandas as pd

# Approach 3: read a local copy of the file via an absolute Windows path
# (raw string so the backslashes are not treated as escape sequences)
users = pd.read_csv(r"D:\Workspace\Anaconda-Workspace\Udemy\Python-Data-Analysis\CH4\3\u.user.txt",
                    sep='|',
                    index_col='user_id') # absolute path
users

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [6]:
# Display the DataFrame (indexed by user_id)
users

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


## 列印前五筆資料

In [7]:
# head() with no argument returns the first five rows
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


## 列印後五筆資料

In [8]:
# tail() with no argument returns the last five rows
users.tail()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
939,26,F,student,33319
940,32,M,administrator,2215
941,20,M,student,97229
942,48,F,librarian,78209
943,22,M,student,77841


## 資料筆數

In [9]:
users.shape[0] # shape is (rows, columns); element 0 is the number of rows

943

## 欄位數量

In [10]:
users.shape[1]  # number of columns
                # user_id is used as the index, so only 4 data columns remain

4

## 欄位名稱和資料型態

In [11]:
# List every column name together with its data type
users.dtypes

age            int64
gender        object
occupation    object
zip_code      object
dtype: object

## 指定欄位

In [12]:
# Select a single column by name; the result is a Series indexed by user_id
users['occupation']
# Equivalent attribute-style access: users.occupation

user_id
1         technician
2              other
3             writer
4         technician
5              other
           ...      
939          student
940    administrator
941          student
942        librarian
943          student
Name: occupation, Length: 943, dtype: object

## 列印職業欄中的第一筆資料

In [13]:
# .iloc[0] picks the first record by position. users.occupation[1] returns the
# same value only because [] looks up the index *label*, and user_id starts at 1.
users.occupation.iloc[0]

'technician'

## 職業欄中，有多少不同種類的職業

In [14]:
users.occupation.nunique() # number of distinct values in the occupation column

21

## 每一種職業的統計數量有多少？

In [15]:
users.occupation.value_counts() # rows per occupation, sorted in descending order (largest first)

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
lawyer            12
salesman          12
none               9
homemaker          7
doctor             7
Name: occupation, dtype: int64

## 出現次數最高的職業是哪一種職業？

In [16]:
# value_counts() is sorted largest-first, so the first entry is the most frequent occupation
users.occupation.value_counts().head(1)

student    196
Name: occupation, dtype: int64

## 出現次數前五高的職業

In [17]:
# The first five entries of the descending counts are the five most frequent occupations
users.occupation.value_counts().head(5)

student          196
other            105
educator          95
administrator     79
engineer          67
Name: occupation, dtype: int64

## 出現次數最少的是哪一種職業？

In [18]:
# The last entry of the descending counts is a least frequent occupation.
# NOTE: tail(1) shows only one row; in this data homemaker and doctor are tied at 7.
users.occupation.value_counts().tail(1)

doctor    7
Name: occupation, dtype: int64

## 職業出現次數最少的兩筆資料

In [19]:
# The last two entries of the descending counts are the two least frequent occupations
users.occupation.value_counts().tail(2)

homemaker    7
doctor       7
Name: occupation, dtype: int64

## 基本統計資料

In [20]:
users.describe() # by default only numeric columns (int and float) are summarized; here that is just age

Unnamed: 0,age
count,943.0
mean,34.051962
std,12.19274
min,7.0
25%,25.0
50%,31.0
75%,43.0
max,73.0


## 全部欄位基本統計資料

In [21]:
# include='all' summarizes every column: non-numeric columns report
# count/unique/top/freq, and statistics that do not apply show up as NaN
users.describe(include='all')

Unnamed: 0,age,gender,occupation,zip_code
count,943.0,943,943,943.0
unique,,2,21,795.0
top,,M,student,55414.0
freq,,670,196,9.0
mean,34.051962,,,
std,12.19274,,,
min,7.0,,,
25%,25.0,,,
50%,31.0,,,
75%,43.0,,,


![image.png](attachment:image.png)
由上述結果可知，此資料的平均年齡約為34歲、以男性居多（943人中有670人為男性），且使用者分布於795個不同的郵遞區號（zip_code）。