### Join
### 一对一Join

In [84]:
import pandas as pd 
df1 = pd.DataFrame({'Employee':['Bob','Jake','Lisa','Tome','Creator'],
                   'Group':['Accounting','Engineering','ProjectManager','HumanResource','VicePredisent']})
df1

Unnamed: 0,Employee,Group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,ProjectManager
3,Tome,HumanResource
4,Creator,VicePredisent


In [85]:
df2 = pd.DataFrame({'Employee':['Bob','Jake','Lisa','Tome','Creator'],
                   'Salary':[2002,2012,2029,2013,3022]})
df2

Unnamed: 0,Employee,Salary
0,Bob,2002
1,Jake,2012
2,Lisa,2029
3,Tome,2013
4,Creator,3022


In [86]:
df3 = pd.merge(df1,df2)
df3

Unnamed: 0,Employee,Group,Salary
0,Bob,Accounting,2002
1,Jake,Engineering,2012
2,Lisa,ProjectManager,2029
3,Tome,HumanResource,2013
4,Creator,VicePredisent,3022


## 多对一Join

In [87]:
df4 = pd.DataFrame({'Group':['Engineering','ProjectManager','HumanResource'],
                   'Supervisor':['Calur','Heoms','Materin']})
df4

Unnamed: 0,Group,Supervisor
0,Engineering,Calur
1,ProjectManager,Heoms
2,HumanResource,Materin


In [88]:
df5 = pd.merge(df3,df4)
df5

Unnamed: 0,Employee,Group,Salary,Supervisor
0,Jake,Engineering,2012,Calur
1,Lisa,ProjectManager,2029,Heoms
2,Tome,HumanResource,2013,Materin


In [89]:
df5 = pd.merge(df4,df3)
df5

Unnamed: 0,Group,Supervisor,Employee,Salary
0,Engineering,Calur,Jake,2012
1,ProjectManager,Heoms,Lisa,2029
2,HumanResource,Materin,Tome,2013


### 多对多Join

In [90]:
df5 = pd.DataFrame({'Group':['Accounting', 'Accounting','Engineering',
                             'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 
                               'linux', 'spreadsheets', 'organization']})
df5

Unnamed: 0,Group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


In [91]:
pd.merge(df4,df5)

Unnamed: 0,Group,Supervisor,skills
0,Engineering,Calur,coding
1,Engineering,Calur,linux


### Merge 中的参数

## On

In [92]:
df1

Unnamed: 0,Employee,Group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,ProjectManager
3,Tome,HumanResource
4,Creator,VicePredisent


In [93]:
df2

Unnamed: 0,Employee,Salary
0,Bob,2002
1,Jake,2012
2,Lisa,2029
3,Tome,2013
4,Creator,3022


### 按照 Employee 来Merge

In [94]:
pd.merge(df1,df2,on = 'Employee')

Unnamed: 0,Employee,Group,Salary
0,Bob,Accounting,2002
1,Jake,Engineering,2012
2,Lisa,ProjectManager,2029
3,Tome,HumanResource,2013
4,Creator,VicePredisent,3022


### left_on and right_on 

In [95]:
df3 = pd.DataFrame({'name':['Bob','Jake','Lisa','Tome','Creator'],
             'salary':[70000,90000,122002,902933,827321]})
df3

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,90000
2,Lisa,122002
3,Tome,902933
4,Creator,827321


In [96]:
pd.merge(df1,df3,left_on = 'Employee',right_on = 'name')

Unnamed: 0,Employee,Group,name,salary
0,Bob,Accounting,Bob,70000
1,Jake,Engineering,Jake,90000
2,Lisa,ProjectManager,Lisa,122002
3,Tome,HumanResource,Tome,902933
4,Creator,VicePredisent,Creator,827321


### 去掉name 那一列！

In [24]:
pd.merge(df1,df3,left_on = 'Employee',right_on = 'name').drop('name',axis = 1)

Unnamed: 0,Employee,Group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,90000
2,Lisa,ProjectManager,122002
3,Tome,HumanResource,902933
4,Creator,VicePredisent,827321


### left_index and right_index 

In [103]:
df1a = df1.set_index('Employee') # 指定一个特定的index = 'Employee'
df1
df1a

Unnamed: 0_level_0,Group
Employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,ProjectManager
Tome,HumanResource
Creator,VicePredisent


In [104]:
df2

Unnamed: 0,Employee,Salary
0,Bob,2002
1,Jake,2012
2,Lisa,2029
3,Tome,2013
4,Creator,3022


In [106]:
df2a = df2.set_index('Employee')
df2a 

Unnamed: 0_level_0,Salary
Employee,Unnamed: 1_level_1
Bob,2002
Jake,2012
Lisa,2029
Tome,2013
Creator,3022


In [113]:
pd.merge(df1a,df2a,left_index = True, right_index = True)  #

Unnamed: 0_level_0,Group,Salary
Employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2002
Jake,Engineering,2012
Lisa,ProjectManager,2029
Tome,HumanResource,2013
Creator,VicePredisent,3022


In [109]:
df1a.join(df2a)

Unnamed: 0_level_0,Group,Salary
Employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2002
Jake,Engineering,2012
Lisa,ProjectManager,2029
Tome,HumanResource,2013
Creator,VicePredisent,3022


### 集合操作

In [116]:
df6 = pd.DataFrame({'name':['Pter','Pass','Nard'],
                   'food':['fish','beans','bread']},
                  columns  = ['food','name'])
df6

Unnamed: 0,food,name
0,fish,Pter
1,beans,Pass
2,bread,Nard


In [120]:
df7 = pd.DataFrame({'name':['Pter','Chae','Mard'],
                   'drink':['wine','beans','beer']},
                  columns  = ['name','drink'])
df7

Unnamed: 0,name,drink
0,Pter,wine
1,Chae,beans
2,Mard,beer


In [123]:
pd.merge(df6,df7)

Unnamed: 0,food,name,drink
0,fish,Pter,wine


In [124]:
pd.merge(df6,df7,how = 'inner')

Unnamed: 0,food,name,drink
0,fish,Pter,wine


In [125]:
pd.merge(df6,df7,how = 'outer')

Unnamed: 0,food,name,drink
0,fish,Pter,wine
1,beans,Pass,
2,bread,Nard,
3,,Chae,beans
4,,Mard,beer


In [133]:
pd.merge(df6,df7,how = 'left')

Unnamed: 0,food,name,drink
0,fish,Pter,wine
1,beans,Pass,
2,bread,Nard,


In [134]:
pd.merge(df6,df7,how = 'right')

Unnamed: 0,food,name,drink
0,fish,Pter,wine
1,,Chae,beans
2,,Mard,beer


In [128]:
df7

Unnamed: 0,name,drink
0,Pter,wine
1,Chae,beans
2,Mard,beer


### 处理相同的列

In [135]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [1, 2, 3, 4]})
df8

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4


In [136]:
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [3, 1, 4, 2]})
df9

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2


In [137]:
pd.merge(df8, df9, on="name") # 自动加后缀


Unnamed: 0,name,rank_x,rank_y
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [139]:
pd.merge(df8, df9, on="name", suffixes=["_HELLO", "_WORLD"])

Unnamed: 0,name,rank_HELLO,rank_WORLD
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [9]:
from sklearn.datasets import load_boston 
boston = load_boston()
boston.keys()

['filename', 'data', 'target', 'DESCR', 'feature_names']

In [10]:
boston.data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [18]:
print boston.data.shape

(506L, 13L)


In [21]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='|S7')

In [140]:
boston.filename

'c:\\python27\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'

In [27]:
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error,r2_score

In [29]:
print boston.DESCR

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [32]:
import pandas as pd
bos = pd.DataFrame(boston.data)
bos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [35]:
print boston.target

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3

In [36]:
from sklearn.model_selection import train_test_split 

In [38]:
bos['PRICE'] = boston.target

In [39]:
X = bos.drop('PRICE',axis = 1)
Y = bos['PRICE']

In [63]:
X_train,X_test,Y_train,Y_test =  train_test_split(X,Y,test_size = 0.2, random_state = 5)
X_train.shape

(404, 13)

In [64]:
X_test.shape 

(102, 13)

In [141]:
Y_train.shape

(404L,)

In [143]:
Y_test.shape

(102L,)

In [146]:
lm = LinearRegression()
lm.fit(X_trian,y_train)
Y_trian_pred = lm.predict(X_trian)
Y_test_pred  = lm.predict(X_test)
lm.score(Y_test_pred,Y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[37.56311787 32.14445143 27.06573629  5.67080633 35.09982577  5.85803701
 27.53708506 31.81019188 26.35634771 22.77208748 31.91183048 21.50224061
 23.70119983 33.3622504  28.51633591 14.39456899  0.19284025 18.66247155
 13.71004139 14.13408635  2.03263952 19.7280831  38.18657429 24.19760058
 31.30247973 11.14144544 25.03636951 23.27970871 22.49420127 20.52972594
 15.16513744  6.92553586 18.3557733  22.37179804 28.91287973 19.02980786
 30.19357214  8.74384915 40.86691522 34.53763591 20.70224878  2.59618963
 29.99590282 12.15704798 27.10186397 30.8052437  -6.24169079 19.84885777
 20.92973441 12.43523958 20.4949947  19.19231742 23.69073157 12.67998473
 17.14252424 25.04649176 34.77758126 15.23294903 28.22306193 21.08745388
 20.39506129 25.79476888 14.72463673 33.18635032 23.17771307 13.11057248
 19.23154617 24.61162961 21.50327036 22.00419172 20.5900874  27.19709085
 16.86361523 18.92610238 20.62344917 25.73255665 22.03855586 14.51899949
 34.3918044  18.5369776  23.38945015 41.36132839 23.27134886 15.62340913
 25.69729854 17.16406313 18.5066679  10.04976469 18.99779955 17.02528993
 35.707325   17.50855206 22.16184894 19.26215663 24.16777784 27.80472748
 12.42828948 21.91295599 22.39477399 13.19335364 23.96991103 21.19914699].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [80]:
print lm.coef_

[-1.30799852e-01  4.94030235e-02  1.09535045e-03  2.70536624e+00
 -1.59570504e+01  3.41397332e+00  1.11887670e-03 -1.49308124e+00
  3.64422378e-01 -1.31718155e-02 -9.52369666e-01  1.17492092e-02
 -5.94076089e-01]


In [81]:
print lm.intercept_

37.912487009750485


In [68]:
df = pd.DataFrame(Y_test_pred,Y_test )
mse = mean_squared_error(Y_test,Y_test_pred)

In [75]:
import numpy as np
rmse = np.sqrt(mse)
print "rmse is ",rmse
import matplotlib.pyplot as plt
#plt.scatter(Y_test_pred, Y_test_pred - Y_test)

rmse is  4.568292042303172


In [73]:
from sklearn.ensemble import RandomForestRegressor 
RF = RandomForestRegressor()
RF.fit(X_train,Y_train)

preds_rf = RF.predict(X_test)
mse_rf = mean_squared_error(Y_test,preds_rf)
rmse_rf = np.sqrt(mse_rf)
rmse_rf

3.794238795755439

In [None]:
datasets = pd.read_csv()
X = datasets.iloc[:,[2,3]].values
y = datasets.iloc[:,4.values

In [82]:
from sklearn.metrics import confusion_matrix 
#cm = confusion_matrix(Y_test,Y_test_pred)