In [2]:
!python --version   # Python version

# About python:  https://www.python.org/
#                Python is powerful... and fast; plays well with others; runs everywhere; is friendly & easy to learn; 
#                is Open –> https://www.python.org/about/.
#     Python docs: https://docs.python.org/3/ (all documentation); 
#                  https://docs.python.org/3.7/ (Recommended version – 3.7). 
# The Python Tutorial (python3.7): https://docs.python.org/3.7/tutorial/index.html 

# Load Module ---
import numpy as np, pandas as pd
# NumPy : The fundamental package for scientific computing with Python. NumPy is the fundamental package for scientific 
#         computing in Python. It is a Python library that provides a multidimensional array object, various derived 
#         objects (such as masked arrays and matrices), and an assortment of routines for fast operations on arrays, 
#         including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms,
#         basic linear algebra, basic statistical operations, random simulation and much more.
#     About: https://numpy.org/
#     Docs: https://numpy.org/doc/stable/
#     NumPy quickstart: https://numpy.org/doc/stable/user/quickstart.html

# Pandas: pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, 
#         built on top of the Python programming language.
#     About: https://pandas.pydata.org/ 
#     Docs: https://pandas.pydata.org/docs/ 
#     Getting started: https://pandas.pydata.org/docs/getting_started/index.html 
#     User Guide: https://pandas.pydata.org/docs/user_guide/index.html#user-guide 

print('numpy version:',np.__version__)
print('pandas version: ',pd.__version__)

Python 3.7.10
numpy version: 1.19.5
pandas version:  1.1.5


# Numpy and pandas warmup

In [3]:
# Genrate one dimension data - Series
one_d_data=np.random.rand(6) #  uniform distribution (in the range [0,1))
print('one_d_data:', one_d_data)
# pandas series
pd_series=pd.Series(data=one_d_data,index=None,dtype=None,name=None,copy=False)
pd_series

one_d_data: [0.1292109  0.15204455 0.33103917 0.56314728 0.17482357 0.4902248 ]


0    0.129211
1    0.152045
2    0.331039
3    0.563147
4    0.174824
5    0.490225
dtype: float64

In [4]:
# Genrate two dimension data - dataframe (rows and columns)
two_d_data=np.random.randn(11,6) # normal distribution (random variable with a Gaussian distribution)
print('two_d_data:\n',two_d_data,sep='')

two_d_data:
[[ 0.03024512  0.50511375  1.24899931  0.20928989 -0.47490977 -0.89977549]
 [ 2.26637449 -0.35148803  0.25534478 -0.72811915 -0.69917574  1.53179337]
 [ 0.41398662  0.53985547  0.42538685  0.87461099  1.2956849  -1.44956502]
 [ 1.03998284  1.264679    0.89577547  1.45059898 -0.40823682 -0.55174932]
 [-1.00260081 -0.53435822  0.59312139  2.23561696 -0.19932813  0.14625376]
 [-0.99450457 -0.99283831 -0.68039512 -1.08610794  0.50637957 -0.10168937]
 [ 0.82489574  1.11154294  1.23895392 -0.40004446 -0.78221198 -0.19062221]
 [ 1.67141209  0.80761513 -0.23664303  0.7821994   1.15189517 -0.0620839 ]
 [ 0.88567916 -0.06809755 -1.46609073  1.13948543 -1.09704302 -1.21928878]
 [-1.71483322 -1.75610579 -0.55671873  1.27918576 -0.94455069  1.53517421]
 [ 0.69872964  1.49396406  0.14250355  0.67135143 -1.14379916  1.05199055]]


In [29]:
# pandas dataframe 
pd_dataframe=pd.DataFrame(data=two_d_data,index=['a','b','c','d','e',1,2,3,4,5,6],columns=range(6),
                          dtype=None,copy=False)
pd_dataframe.head() # returns top five values

Unnamed: 0,0,1,2,3,4,5
a,0.030245,0.505114,1.248999,0.20929,-0.47491,-0.899775
b,2.266374,-0.351488,0.255345,-0.728119,-0.699176,1.531793
c,0.413987,0.539855,0.425387,0.874611,1.295685,-1.449565
d,1.039983,1.264679,0.895775,1.450599,-0.408237,-0.551749
e,-1.002601,-0.534358,0.593121,2.235617,-0.199328,0.146254


In [8]:
pd_dataframe.tail(3) # returns bottom three values

Unnamed: 0,0,1,2,3,4,5
4,0.885679,-0.068098,-1.466091,1.139485,-1.097043,-1.219289
5,-1.714833,-1.756106,-0.556719,1.279186,-0.944551,1.535174
6,0.69873,1.493964,0.142504,0.671351,-1.143799,1.051991


In [9]:
# Info
pd_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11 entries, a to 6
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       11 non-null     float64
 1   1       11 non-null     float64
 2   2       11 non-null     float64
 3   3       11 non-null     float64
 4   4       11 non-null     float64
 5   5       11 non-null     float64
dtypes: float64(6)
memory usage: 616.0+ bytes


In [13]:
# Stats
pd_dataframe.describe()

Unnamed: 0,0,1,2,3,4,5
count,11.0,11.0,11.0,11.0,11.0,11.0
mean,0.374488,0.183626,0.169113,0.58437,-0.254118,-0.019051
std,1.205375,1.018732,0.847312,1.001085,0.865352,1.028576
min,-1.714833,-1.756106,-1.466091,-1.086108,-1.143799,-1.449565
25%,-0.48213,-0.442923,-0.396681,-0.095377,-0.863381,-0.725762
50%,0.69873,0.505114,0.255345,0.782199,-0.47491,-0.101689
75%,0.962831,0.959579,0.744448,1.209336,0.153526,0.599122
max,2.266374,1.493964,1.248999,2.235617,1.295685,1.535174


In [15]:
# Null values 
pd_dataframe.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
dtype: int64

In [17]:
# mathematical operation
sample_dataframe=pd.DataFrame(np.arange(0,15,1).reshape(5,3),columns=['a','b','c'])
sample_dataframe

Unnamed: 0,3,4,5
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14


In [20]:
# opertion 
sample_dataframe+3  # not change original value 

Unnamed: 0,3,4,5
0,0.0,0.333333,0.666667
1,1.0,1.333333,1.666667
2,2.0,2.333333,2.666667
3,3.0,3.333333,3.666667
4,4.0,4.333333,4.666667


In [21]:
sample_dataframe+=3   # short-hand a=a+x -> a+=x 
sample_dataframe  # changed original value

Unnamed: 0,3,4,5
0,3,4,5
1,6,7,8
2,9,10,11
3,12,13,14
4,15,16,17


In [23]:
sample_dataframe%3==1 # where reminder is 1 (one)

Unnamed: 0,3,4,5
0,False,True,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False


In [34]:
# Saving file as csv 
pd_dataframe.to_csv('my_test_file',sep='$')
del pd_dataframe

In [38]:
# read file 
my_test_data=pd.read_csv('my_test_file',sep='$')
my_test_data.to_json()

# About Dataset – Stock Portfolio Performance

The dataset of performances of weighted scoring stock portfolios are obtained with mixture design from the US stock market historical database. 

Dataset source (UCI Machine Learning Repository):	 https://archive.ics.uci.edu/ml/datasets/Stock+portfolio+performance

Dataset download link: [stock portfolio performance data set.xlsx](https://archive.ics.uci.edu/ml/machine-learning-databases/00390/stock%20portfolio%20performance%20data%20set.xlsx) 


In [41]:
#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00390/stock%20portfolio%20performance%20data%20set.xlsx

# Read dataset 
data_file_link_xlsx='https://archive.ics.uci.edu/ml/machine-learning-databases/00390/stock%20portfolio%20performance%20data%20set.xlsx'
data=pd.read_excel(data_file_link_xlsx,sheet_name='4th period',header=1)

In [42]:
data.head()

Unnamed: 0,ID,Large B/P,Large ROE,Large S/P,Large Return Rate in the last quarter,Large Market Value,Small systematic Risk,Annual Return,Excess Return,Systematic Risk,Total Risk,Abs. Win Rate,Rel. Win Rate,Annual Return.1,Excess Return.1,Systematic Risk.1,Total Risk.1,Abs. Win Rate.1,Rel. Win Rate.1
0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.019516,0.013399,1.902608,0.218617,0.6,0.4,0.488229,0.609445,0.780756,0.8,0.68,0.333333
1,2,0.0,1.0,0.0,0.0,0.0,0.0,0.023829,0.00641,1.263287,0.12874,0.55,0.65,0.505279,0.508169,0.443776,0.41422,0.56,0.666667
2,3,0.0,0.0,1.0,0.0,0.0,0.0,0.080282,0.026548,1.894339,0.208272,0.55,0.5,0.728484,0.8,0.776397,0.755594,0.56,0.466667
3,4,0.0,0.0,0.0,1.0,0.0,0.0,-0.006683,0.000728,1.425454,0.155526,0.55,0.35,0.38464,0.425836,0.529253,0.529196,0.56,0.266667
4,5,0.0,0.0,0.0,0.0,1.0,0.0,0.021999,0.004407,1.094579,0.106729,0.65,0.7,0.498046,0.479146,0.354852,0.319743,0.8,0.733333


In [43]:
data.shape  # rows , columns 

(63, 19)

In [44]:
column_dict={'ID':'id',
             ' Large B/P ':'bp_large',
             ' Large ROE ':'roe_large',
             ' Large S/P ':'sp_large',
             ' Large Return Rate in the last quarter ':'ror_large_last_quarter',
             ' Large Market Value ':'mv_large',
             ' Small systematic Risk':'systematic_risk_small',
             'Annual Return':'annual_return',
             'Excess Return':'excess_return',
             'Systematic Risk':'systemtic_risk_actual',
             'Total Risk':'total_risk',
             'Abs. Win Rate':'abs_win_rate',
             'Rel. Win Rate':'relative_win_rate',
             'Annual Return.1':'annual_return_norm',
             'Excess Return.1':'excess_return_norm',
             'Systematic Risk.1':'systemtic_risk_actual_norm',
             'Total Risk.1':'total_risk_norm',
             'Abs. Win Rate.1':'abs_win_rate_norm',
             'Rel. Win Rate.1':'relative_win_rate_norm'}

In [45]:
data.rename(index=None,columns=column_dict,axis=None,copy=True,inplace=True)
data.tail()

Unnamed: 0,id,bp_large,roe_large,sp_large,ror_large_last_quarter,mv_large,systematic_risk_small,annual_return,excess_return,systemtic_risk_actual,total_risk,abs_win_rate,relative_win_rate,annual_return_norm,excess_return_norm,systemtic_risk_actual_norm,total_risk_norm,abs_win_rate_norm,relative_win_rate_norm
58,59,0.2,0.2,0.2,0.0,0.2,0.2,0.034682,0.007396,1.010771,0.100845,0.55,0.65,0.548192,0.522462,0.310677,0.294491,0.56,0.666667
59,60,0.2,0.2,0.0,0.2,0.2,0.2,0.033733,0.006783,0.918444,0.092028,0.6,0.6,0.544438,0.51357,0.262013,0.256644,0.68,0.6
60,61,0.2,0.0,0.2,0.2,0.2,0.2,0.044852,0.010413,1.162878,0.118172,0.55,0.5,0.588401,0.566175,0.390852,0.368861,0.56,0.466667
61,62,0.0,0.2,0.2,0.2,0.2,0.2,0.040456,0.008777,0.998392,0.100601,0.6,0.65,0.571021,0.542471,0.304153,0.293441,0.68,0.666667
62,63,0.167,0.167,0.167,0.167,0.167,0.167,0.05751,0.012866,1.048489,0.10497,0.55,0.75,0.638448,0.601735,0.330558,0.312193,0.56,0.8


### basic 

In [46]:
# info 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          63 non-null     int64  
 1   bp_large                    63 non-null     float64
 2   roe_large                   63 non-null     float64
 3   sp_large                    63 non-null     float64
 4   ror_large_last_quarter      63 non-null     float64
 5   mv_large                    63 non-null     float64
 6   systematic_risk_small       63 non-null     float64
 7   annual_return               63 non-null     float64
 8   excess_return               63 non-null     float64
 9   systemtic_risk_actual       63 non-null     float64
 10  total_risk                  63 non-null     float64
 11  abs_win_rate                63 non-null     float64
 12  relative_win_rate           63 non-null     float64
 13  annual_return_norm          63 non-nu

In [47]:
# null values 
data.isna().sum()

id                            0
bp_large                      0
roe_large                     0
sp_large                      0
ror_large_last_quarter        0
mv_large                      0
systematic_risk_small         0
annual_return                 0
excess_return                 0
systemtic_risk_actual         0
total_risk                    0
abs_win_rate                  0
relative_win_rate             0
annual_return_norm            0
excess_return_norm            0
systemtic_risk_actual_norm    0
total_risk_norm               0
abs_win_rate_norm             0
relative_win_rate_norm        0
dtype: int64

In [51]:
# stats - Transpose
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,63.0,32.0,18.330303,1.0,16.5,32.0,47.5,63.0
bp_large,63.0,0.166619,0.199304,0.0,0.0,0.167,0.2915,1.0
roe_large,63.0,0.166619,0.199304,0.0,0.0,0.167,0.2915,1.0
sp_large,63.0,0.166619,0.199304,0.0,0.0,0.167,0.2915,1.0
ror_large_last_quarter,63.0,0.166619,0.199304,0.0,0.0,0.167,0.2915,1.0
mv_large,63.0,0.166619,0.199304,0.0,0.0,0.167,0.2915,1.0
systematic_risk_small,63.0,0.166619,0.199304,0.0,0.0,0.167,0.2915,1.0
annual_return,63.0,0.040384,0.028337,-0.053382,0.021405,0.042629,0.061776,0.098369
excess_return,63.0,0.010196,0.007972,-0.014856,0.004378,0.010413,0.01584,0.026548
systemtic_risk_actual,63.0,1.206636,0.271843,0.800792,0.997674,1.181784,1.363218,1.939118


### sort 

In [52]:
# sort columns by names
data.reindex(sorted(data.columns,reverse=False),axis='columns').head(3) # or axis=0; inplace -> X
# pandas.DataFrame.sort_index -> https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_index.htm

Unnamed: 0,abs_win_rate,abs_win_rate_norm,annual_return,annual_return_norm,bp_large,excess_return,excess_return_norm,id,mv_large,relative_win_rate,relative_win_rate_norm,roe_large,ror_large_last_quarter,sp_large,systematic_risk_small,systemtic_risk_actual,systemtic_risk_actual_norm,total_risk,total_risk_norm
0,0.6,0.68,0.019516,0.488229,1.0,0.013399,0.609445,1,0.0,0.4,0.333333,0.0,0.0,0.0,0.0,1.902608,0.780756,0.218617,0.8
1,0.55,0.56,0.023829,0.505279,0.0,0.00641,0.508169,2,0.0,0.65,0.666667,1.0,0.0,0.0,0.0,1.263287,0.443776,0.12874,0.41422
2,0.55,0.56,0.080282,0.728484,0.0,0.026548,0.8,3,0.0,0.5,0.466667,0.0,0.0,1.0,0.0,1.894339,0.776397,0.208272,0.755594


In [53]:
# sort by column 
data.sort_values(['bp_large','roe_large'],axis=0,ascending=True,inplace=False,kind='quicksort',na_position='last').tail(7)
# for more look - https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html

Unnamed: 0,id,bp_large,roe_large,sp_large,ror_large_last_quarter,mv_large,systematic_risk_small,annual_return,excess_return,systemtic_risk_actual,total_risk,abs_win_rate,relative_win_rate,annual_return_norm,excess_return_norm,systemtic_risk_actual_norm,total_risk_norm,abs_win_rate_norm,relative_win_rate_norm
31,32,0.333,0.333,0.0,0.0,0.0,0.333,0.016851,0.003217,0.911258,0.096577,0.65,0.45,0.477689,0.461894,0.258225,0.276172,0.8,0.4
7,8,0.5,0.0,0.5,0.0,0.0,0.0,0.0617,0.02337,1.939118,0.218238,0.6,0.5,0.655017,0.753942,0.8,0.798371,0.68,0.466667
9,10,0.5,0.0,0.0,0.5,0.0,0.0,0.068515,0.019034,1.524957,0.162835,0.55,0.4,0.68196,0.691107,0.5817,0.560565,0.56,0.333333
12,13,0.5,0.0,0.0,0.0,0.5,0.0,0.025587,0.007369,1.420759,0.145995,0.55,0.5,0.512232,0.522063,0.526778,0.488286,0.56,0.466667
16,17,0.5,0.0,0.0,0.0,0.0,0.5,-0.023439,-0.006177,0.977325,0.106924,0.65,0.45,0.318389,0.325766,0.293048,0.320582,0.8,0.4
6,7,0.5,0.5,0.0,0.0,0.0,0.0,0.061851,0.016324,1.392192,0.144132,0.55,0.65,0.655613,0.65184,0.511721,0.480288,0.56,0.666667
0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.019516,0.013399,1.902608,0.218617,0.6,0.4,0.488229,0.609445,0.780756,0.8,0.68,0.333333


In [None]:
# sort by row
data.sort_values([0,2],axis=1).head(3)

### convert 

In [None]:
# to locate all column values as numpy array
data.bp_large.to_numpy()    # or to_json or to_csv , also can save values
#data.bp_large.values

In [None]:
# save as csv
data.describe().to_csv('data_stats.csv')

### locating values 

In [None]:
# stats - Transpose
data.describe().loc[['count','mean','std','min','max'],data.columns[1:]]   # locate values by names (rows, columns) 

In [None]:
# to locate row
data.loc[1]

In [None]:
# to locate all column values
data.bp_large  # or data['bp_large'] 

In [None]:
# for single value 

# method 1
print('method 1 -')
%timeit data.loc[7,'bp_large']

# method 2
print('method 2 -')
%timeit data.bp_large[7] # data.bp_large -> pandas.series

# method 2.2
print('method 3 -')
%timeit data.bp_large.loc[7]

# method 3 - fast 
print('method 4 -')
%timeit data.at[7,'bp_large']

print('value:',data.at[7,'bp_large'])

### Boolean indexing (locating)

In [None]:
# all negative values in dataset
data[data<0]

In [None]:
data[data.annual_return<0]    # negtaive annual return, column wise, all values

In [None]:
data.id[data.annual_return<0]   # id of stocks with negtaive annual return 

In [None]:
data.loc[3,data.loc[3]<0] # get row wise value, single 

In [None]:
# check for multiple columns 
data[data.loc[:,['sp_large','bp_large','roe_large']]<0.5].loc[:,['sp_large','bp_large','roe_large']]

In [None]:
data[data.iloc[3:7]<0].iloc[3:7].dropna(axis=0,how='all') # axis : {0 or 'index', 1 or 'columns'}, default 0

In [None]:
data[data.iloc[3:7]<0].iloc[3:7].dropna(axis=0,how='all').dropna(axis=1,how='all') # more filtering 

### pd.dataframe.where ?

In [None]:
data.where(data.iloc[3:7]<0,other=np.nan,inplace=False,axis=None) # check particular condition 

In [None]:
data.where(data.iloc[3:7]<0,other=np.nan,inplace=False,axis=None).dropna(axis=0,how='all') # check particular condition 

In [None]:
data.iloc[3:7].where(data.iloc[3:7]<0,other=np.nan,inplace=False,axis=None) # check particular condition , sub-datsset

In [None]:
#data.loc[data.iloc[3:7].where(data.iloc[3:7]<0,other=np.nan,inplace=False,axis=None).index] # see all values of particular condition
# or 
data.iloc[3:7].where(data.iloc[3:7]<0,other=data+9,inplace=False,axis=None)

### drop rows or columns 

In [None]:
data.drop(labels=None,axis=0,index=None,columns=data.columns[-6:],inplace=False).head(4)  # all normalized columns 

In [None]:
data.drop(labels=None,axis=0,index=data.index[5:],columns=data.columns[-6:],inplace=False).head(4)  # both columns and rows 

### pd.dataframe.isin() # find multiple values 

In [None]:
data[data.isin([0.333,0.250])].dropna(how='all').dropna(axis=1,how='all').head()

In [None]:
data[data.isin([0.333,0.250])].dropna(how='all').dropna(axis=1,how='all').tail()

### stats 

In [None]:
# mean 
data.mean(axis=None,skipna=True,level=None,numeric_only=True) # get mean for numeric data only 

In [None]:
# median 
data.median(axis=1,skipna=None,level=None,numeric_only=None)

In [None]:
# max, min
data.loc[[3,9,23],['bp_large','roe_large']].max() # row wise 

In [None]:
# max, min
data.loc[:,['bp_large','roe_large']].T.min() # column wise

### Setting or assigning values

In [None]:
data_small=data.drop(columns=data.columns[-12:]) # return copy 

In [None]:
# adding column 
data_small['mean_value']=data_small[['bp_large','roe_large','sp_large']].mean(axis=1)

In [None]:
data_small.head()

In [None]:
# adding row 
data_small.loc['median_value']=data_small.median(axis=0)

In [None]:
data_small.tail()

### pd.dataframe.apply

In [None]:
# apply pre-define function
data_small.apply(np.log).head()     # all values 

In [None]:
# user define function 
def my_magic_function(value,custom_value):
  if round(value,3)>0.134:
    return custom_value
  else: return value

# apply 
data_small.mean_value.apply(my_magic_function,convert_dtype=True,args=(-1,)).tail()

### data normalization 

In [55]:
# import function 
from sklearn.preprocessing import normalize
# doc -> https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
# axis -> # 0:column; 1:rows 
# l1 -> value/sum(respective row or column) --> norm_value -> sum axis wise
# l2 -> value/sqrt(sum of square of each element wise of respective row or column))  --> norm_value -> square_root(sum(axis wise element square))
# 'max' -> value/max(respective row or column) --> norm_value -> maximum value of axis 

In [56]:
# Smaple example ---
sample_matrix=np.arange(0,9).reshape(3,3)
print(sample_matrix,'\n','--x--'*30,sep='')
norm_sample_matrix,norms=normalize(sample_matrix,norm='l2',axis=0,copy=True,return_norm=True)
print(norms)
print('--x--'*30)
norm_sample_matrix

[[0 1 2]
 [3 4 5]
 [6 7 8]]
--x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x--
[6.70820393 8.1240384  9.64365076]
--x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x----x--


array([[0.        , 0.12309149, 0.20739034],
       [0.4472136 , 0.49236596, 0.51847585],
       [0.89442719, 0.86164044, 0.82956136]])

In [62]:
data_small=data[data.columns[1:7]]
data_small

Unnamed: 0,bp_large,roe_large,sp_large,ror_large_last_quarter,mv_large,systematic_risk_small
0,1.000,0.000,0.000,0.000,0.000,0.000
1,0.000,1.000,0.000,0.000,0.000,0.000
2,0.000,0.000,1.000,0.000,0.000,0.000
3,0.000,0.000,0.000,1.000,0.000,0.000
4,0.000,0.000,0.000,0.000,1.000,0.000
...,...,...,...,...,...,...
58,0.200,0.200,0.200,0.000,0.200,0.200
59,0.200,0.200,0.000,0.200,0.200,0.200
60,0.200,0.000,0.200,0.200,0.200,0.200
61,0.000,0.200,0.200,0.200,0.200,0.200


In [63]:
# Method 1 - direct 
data_small_norm_matrix,_=normalize(data_small[data_small.columns[1:]].values,norm='l2',axis=0,copy=True,return_norm=True)
data_small_norm=pd.DataFrame(data=data_small_norm_matrix,index=None,columns=data_small.columns[1:],
                             dtype=None,copy=True)
#data_small_norm['id']=data_small.id
data_small_norm.head()

Unnamed: 0,roe_large,sp_large,ror_large_last_quarter,mv_large,systematic_risk_small
0,0.0,0.0,0.0,0.0,0.0
1,0.487267,0.0,0.0,0.0,0.0
2,0.0,0.487267,0.0,0.0,0.0
3,0.0,0.0,0.487267,0.0,0.0
4,0.0,0.0,0.0,0.487267,0.0


In [64]:
# data x
x=data_small_norm.to_numpy()

In [65]:
# label y
y=data.annual_return_norm.to_numpy()

In [69]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1)

In [71]:
from sklearn.linear_model import LinearRegression

reg=LinearRegression()

In [72]:
reg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [74]:
y_pred=reg.predict(x_test)

In [77]:
y_pred,y_test

(array([0.55046766, 0.48970155, 0.64257627, 0.64889429, 0.43820467,
        0.60185568, 0.57948087]),
 array([0.55911677, 0.5332737 , 0.64021539, 0.7229442 , 0.43566399,
        0.57961118, 0.60937582]))

In [78]:
sum((y_pred-y_test)**2)

0.008857282211017461

In [None]:
# re-arrange columns 
re_columns=list(data_small_norm.columns)
# -Method -1 
re_columns=re_columns[-1:]+re_columns[:-1]
# -Method -2
#re_columns.insert(0,re_columns.pop())
print(re_columns)
data_small_norm=data_small_norm[re_columns]
data_small_norm.head()