## Load the diamonds data from S3 with pandas

In [1]:
import pandas

In [2]:
# Read the diamonds workbook directly from the S3 bucket into a DataFrame.
# NOTE(review): reading an s3:// URL presumably needs an S3 filesystem backend
# installed alongside pandas -- confirm for the target environment.
df = pandas.read_excel(io='s3://isat252-luker/Diamonds.xls')

df.head(10)  # preview the first 10 rows of the diamonds table

Unnamed: 0,IDNO,WEIGHT,COLOR,CLARITY,RATER,PRICE
0,1,0.3,D,VS2,GIA,1302
1,2,0.3,E,VS1,GIA,1510
2,3,0.3,G,VVS1,GIA,1510
3,4,0.3,G,VS1,GIA,1260
4,5,0.31,D,VS1,GIA,1641
5,6,0.31,E,VS1,GIA,1555
6,7,0.31,F,VS1,GIA,1427
7,8,0.31,G,VVS2,GIA,1427
8,9,0.31,H,VS2,GIA,1126
9,10,0.31,I,VS1,GIA,1126


In [3]:
# Quick statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,IDNO,WEIGHT,PRICE
count,308.0,308.0,308.0
mean,154.5,0.630909,5019.483766
std,89.056162,0.277183,3403.115715
min,1.0,0.18,638.0
25%,77.75,0.35,1625.0
50%,154.5,0.62,4215.0
75%,231.25,0.85,7446.0
max,308.0,1.1,16008.0


In [4]:
# Select a single column by its label, using the same key-lookup syntax as a dictionary.
# The result is a one-dimensional Series rather than a table.
df['WEIGHT']

0      0.30
1      0.30
2      0.30
3      0.30
4      0.31
5      0.31
6      0.31
7      0.31
8      0.31
9      0.31
10     0.32
11     0.32
12     0.33
13     0.33
14     0.34
15     0.34
16     0.34
17     0.34
18     0.34
19     0.34
20     0.35
21     0.35
22     0.35
23     0.35
24     0.36
25     0.36
26     0.37
27     0.37
28     0.40
29     0.40
       ... 
278    1.00
279    1.00
280    1.00
281    1.00
282    1.00
283    1.00
284    1.00
285    1.00
286    1.00
287    1.00
288    1.00
289    1.00
290    1.00
291    1.00
292    1.00
293    1.01
294    1.01
295    1.01
296    1.01
297    1.01
298    1.01
299    1.01
300    1.01
301    1.01
302    1.01
303    1.01
304    1.02
305    1.06
306    1.02
307    1.09
Name: WEIGHT, Length: 308, dtype: float64

In [5]:
# Slice rows by position, using the same syntax as a list slice: the start is
# inclusive and the stop is exclusive, so this shows rows 1 through 4.
df[1:5]

Unnamed: 0,IDNO,WEIGHT,COLOR,CLARITY,RATER,PRICE
1,2,0.3,E,VS1,GIA,1510
2,3,0.3,G,VVS1,GIA,1510
3,4,0.3,G,VS1,GIA,1260
4,5,0.31,D,VS1,GIA,1641


In [8]:
# Select rows by value: build a boolean mask and keep only the rows where COLOR is 'D'.
df[df['COLOR'] == 'D']

Unnamed: 0,IDNO,WEIGHT,COLOR,CLARITY,RATER,PRICE
0,1,0.3,D,VS2,GIA,1302
4,5,0.31,D,VS1,GIA,1641
42,43,0.52,D,VS2,GIA,3490
46,47,0.53,D,VS1,GIA,3921
94,95,0.71,D,VS1,GIA,6372
106,107,0.75,D,VVS2,GIA,7368
109,110,0.76,D,IF,GIA,9885
115,116,1.0,D,VVS1,GIA,15582
116,117,1.0,D,VS1,GIA,11419
130,131,1.01,D,VVS1,GIA,16008


In [11]:
# Frequency of each distinct RATER value, listed from most to least common.
df['RATER'].value_counts()

GIA    151
HRD     79
IGI     78
Name: RATER, dtype: int64

In [17]:
# Number of non-null entries in the RATER column (missing values are not counted).
df['RATER'].count()

308

In [15]:
# Largest PRICE in the table. Many other summary statistics are available on a
# column in the same way, e.g. min, standard deviation, mean, and median.
df['PRICE'].max()

16008

In [19]:
# Aggregation: group the rows by RATER and take the maximum of every other column.
# Each column is maximised independently (text columns included), so a row of the
# result combines values from different diamonds rather than describing one diamond.
df.groupby('RATER').max()

Unnamed: 0_level_0,IDNO,WEIGHT,COLOR,CLARITY,PRICE
RATER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GIA,151,1.1,I,VVS2,16008
HRD,308,1.09,I,VVS2,14051
IGI,229,1.01,I,VVS2,9713


In [21]:
# Highest PRICE recorded for each RATER, as a Series indexed by RATER.
# Select the PRICE column *before* aggregating so that only that column is
# reduced, instead of computing the max of every column (including the text
# columns) and then discarding all but one.
max_price_per_rater = df.groupby('RATER')['PRICE'].max()
max_price_per_rater

RATER
GIA    16008
HRD    14051
IGI     9713
Name: PRICE, dtype: int64

In [22]:
# Create a new derived column: price per unit of weight for every row.
price_per_weight = df['PRICE'] / df['WEIGHT']
df['unit_price'] = price_per_weight
df.head(5)  # first five rows, now including unit_price

Unnamed: 0,IDNO,WEIGHT,COLOR,CLARITY,RATER,PRICE,unit_price
0,1,0.3,D,VS2,GIA,1302,4340.0
1,2,0.3,E,VS1,GIA,1510,5033.333333
2,3,0.3,G,VVS1,GIA,1510,5033.333333
3,4,0.3,G,VS1,GIA,1260,4200.0
4,5,0.31,D,VS1,GIA,1641,5293.548387


## Simple linear regression of PRICE on WEIGHT

In [23]:
from scipy import stats

In [25]:
# Fit a simple linear regression (SLR) with WEIGHT as the predictor (x)
# and PRICE as the response (y).
result = stats.linregress(x=df['WEIGHT'], y=df['PRICE'])

In [28]:
# Report the fitted line and its fit statistics, one figure per line.
fit_report = (
    'slope is {}'.format(result.slope),
    'intercept is {}'.format(result.intercept),
    # rvalue is the correlation coefficient r; multiply it by itself for r squared
    'r square is {}'.format(result.rvalue * result.rvalue),
    'p value is {}'.format(result.pvalue),
    'std error is {}'.format(result.stderr),
)
for report_line in fit_report:
    print(report_line)

slope is 11598.884012882309
intercept is -2298.3576018937993
r square is 0.8925083858672289
p value is 3.0448096265906994e-150
std error is 230.1106037406023


In [29]:
# Predict the price for a given weight from the fitted line:
#   price = slope * weight + intercept
weight = 0.9
predicted_price = weight * result.slope + result.intercept
print('p of {} w is ${}'.format(weight, predicted_price))

p of 0.9 w is $8140.638009700279


## Sentiment analysis with TextBlob

In [30]:
!pip install textblob

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K    100% |████████████████████████████████| 645kB 19.4MB/s ta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [31]:
from textblob import TextBlob

In [39]:
# Wrap the sentence in a TextBlob so its sentiment can be inspected.
# NOTE(review): this rebinds `result`, which the linear-regression cells use for
# the linregress output; re-running those cells' prints after this one will no
# longer see the regression result. Consider a distinct name.
result = TextBlob('I love python')

In [40]:
# Read the sentiment once, then print its two components.
sentiment = result.sentiment
print('polarity {}'.format(sentiment.polarity))
print('subjectivity {}'.format(sentiment.subjectivity))

polarity 0.5
subjectivity 0.6
