# Chapter 3: Beginning Data Analysis

## Recipes
* [Developing a data analysis routine](#Developing-a-data-analysis-routine)
* [Reducing memory by changing data types](#Reducing-memory-by-changing-data-types)
* [Selecting the smallest of the largest](#Selecting-the-smallest-of-the-largest)
* [Selecting the largest of each group by sorting](#Selecting-the-largest-of-each-group-by-sorting)
* [Replicating nlargest with sort_values](#Replicating-nlargest-with-sort_values)
* [Calculating a trailing stop order price](#Calculating-a-trailing-stop-order-price)

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
pd.options.display.max_columns = 50

# Developing a data analysis routine

In [2]:
college = pd.read_csv('../notebooks/data/college.csv')

In [3]:
college.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,291.0,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,5451.0,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,4811.0,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [4]:
college.shape

(7535, 27)

In [5]:
with pd.option_context('display.max_rows', 8):
    display(college.describe(include=[np.number]).T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0000,0.00000,0.000000,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0000,0.00000,0.000000,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0000,0.00000,0.000000,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0000,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...,...
CURROPER,7535.0,0.923291,0.266146,0.0,1.0000,1.00000,1.000000,1.0
PCTPELL,6849.0,0.530643,0.225544,0.0,0.3578,0.52150,0.712900,1.0
PCTFLOAN,6849.0,0.522211,0.283616,0.0,0.3329,0.58330,0.745000,1.0
UG25ABV,6718.0,0.410021,0.228939,0.0,0.2415,0.40075,0.572275,1.0


In [7]:
college.describe(include=['object', pd.Categorical]).T

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Alabama A & M University,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


In [8]:
college.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

In [9]:
college.describe(include=[np.number]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [11]:
college.describe(include=['object', pd.Categorical]).T

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Alabama A & M University,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


## There's more...

In [12]:
with pd.option_context('display.max_rows', 5):
    display(college.describe(include=[np.number], 
                 percentiles=[.01, .05, .10, .25, .5, .75, .9, .95, .99]).T)

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0000,0.0000,0.0000,0.0000,0.00000,0.000000,0.00000,0.00000,1.000000,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0000,0.0000,0.0000,0.0000,0.00000,0.000000,0.00000,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PCTFLOAN,6849.0,0.522211,0.283616,0.0,0.0000,0.0000,0.0000,0.3329,0.58330,0.745000,0.84752,0.89792,0.986368,1.0
UG25ABV,6718.0,0.410021,0.228939,0.0,0.0025,0.0374,0.0899,0.2415,0.40075,0.572275,0.72666,0.80000,0.917383,1.0


In [14]:
college_dd = pd.read_csv('../notebooks/data/college_data_dictionary.csv')

In [15]:
with pd.option_context('display.max_rows', 8):
    display(college_dd)

Unnamed: 0,column_name,description
0,INSTNM,Institution Name
1,CITY,City Location
2,STABBR,State Abbreviation
3,HBCU,Historically Black College or University
...,...,...
23,PCTFLOAN,Percent Students with federal loan
24,UG25ABV,Percent Students Older than 25
25,MD_EARN_WNE_P10,Median Earnings 10 years after enrollment
26,GRAD_DEBT_MDN_SUPP,Median debt of completers


# Reducing memory by changing data types

In [16]:
college = pd.read_csv('../notebooks/data/college.csv')
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR']
col2 = college.loc[:, different_cols]
col2.head()

Unnamed: 0,RELAFFIL,SATMTMID,CURROPER,INSTNM,STABBR
0,0,420.0,1,Alabama A & M University,AL
1,0,565.0,1,University of Alabama at Birmingham,AL
2,1,,1,Amridge University,AL
3,0,590.0,1,University of Alabama in Huntsville,AL
4,0,430.0,1,Alabama State University,AL


In [17]:
col2.dtypes

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [18]:
original_mem = col2.memory_usage(deep=True)
original_mem

Index          132
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      599848
STABBR      384285
dtype: int64

In [19]:
col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)

In [20]:
col2.dtypes

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [21]:
col2.select_dtypes(include=['object']).nunique()

INSTNM    7535
STABBR      59
dtype: int64

In [22]:
col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes

RELAFFIL        int8
SATMTMID     float64
CURROPER       int64
INSTNM        object
STABBR      category
dtype: object

In [23]:
new_mem = col2.memory_usage(deep=True)
new_mem

Index          132
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      600307
STABBR       12648
dtype: int64

In [24]:
new_mem / original_mem

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000765
STABBR      0.032913
dtype: float64

## There's more...

In [27]:
college = pd.read_csv('../notebooks/data/college.csv')

In [28]:
college[['CURROPER', 'INSTNM']].memory_usage(deep=True)

Index          132
CURROPER     60280
INSTNM      599848
dtype: int64

In [29]:
college.loc[0, 'CURROPER'] = 10000000
college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a'
# college.loc[1, 'INSTNM'] = college.loc[1, 'INSTNM'] + 'a'
college[['CURROPER', 'INSTNM']].memory_usage(deep=True)

Index          132
CURROPER     60280
INSTNM      599849
dtype: int64

In [30]:
college['MENONLY'].dtype

dtype('float64')

In [32]:
college['MENONLY'].astype('Int8')  # Note the capital 'I'# ValueError: Cannot convert non-finite values (NA or inf) to integer

0          0
1          0
2          0
3          0
4          0
        ... 
7530    <NA>
7531    <NA>
7532    <NA>
7533    <NA>
7534    <NA>
Name: MENONLY, Length: 7535, dtype: Int8

In [33]:
college.describe(include=['int64', 'float64']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [34]:
college.describe(include=[np.int64, np.float64]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [35]:
college['RELAFFIL'] = college['RELAFFIL'].astype(np.int8)

In [36]:
college.describe(include=['int', 'float']).T  # defaults to 64 bit int/floats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0
UGDS_HISP,6874.0,0.161635,0.221854,0.0,0.0276,0.0714,0.198875,1.0


In [37]:
college.describe(include=['number']).T  # also works as the default int/float are 64 bits

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [38]:
college['MENONLY'] = college['MENONLY'].astype('float16')
college['RELAFFIL'] = college['RELAFFIL'].astype('int8')

In [40]:
college.index = pd.Index(college.index, dtype='int64')
college.index.memory_usage()

132

# Selecting the smallest of the largest

In [41]:
movie = pd.read_csv('../notebooks/data/movie.csv')
movie2 = movie[['movie_title', 'imdb_score', 'budget']]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens ...,7.1,


In [42]:
movie2.nlargest(100, 'imdb_score').head()

Unnamed: 0,movie_title,imdb_score,budget
2765,Towering Inferno,9.5,
1937,The Shawshank Redemption,9.3,25000000.0
3466,The Godfather,9.2,6000000.0
2824,Dekalog,9.1,
3207,Dekalog,9.1,


In [43]:
movie2.nlargest(100, 'imdb_score').nsmallest(5, 'budget')

Unnamed: 0,movie_title,imdb_score,budget
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0
2242,Psycho,8.5,806947.0


# Selecting the largest of each group by sorting

In [44]:
movie = pd.read_csv('../notebooks/data/movie.csv')
movie2 = movie[['movie_title', 'title_year', 'imdb_score']]

In [45]:
movie2.sort_values('title_year', ascending=False).head()

Unnamed: 0,movie_title,title_year,imdb_score
2366,Fight Valley,2016.0,5.0
3817,Yoga Hosers,2016.0,4.8
1367,The 5th Wave,2016.0,5.2
1742,The Boss,2016.0,5.3
519,The Secret Life of Pets,2016.0,6.8


In [46]:
movie3 = movie2.sort_values(['title_year','imdb_score'], ascending=False)
movie3.head()

Unnamed: 0,movie_title,title_year,imdb_score
4409,Kickboxer: Vengeance,2016.0,9.1
4372,A Beginner's Guide to Snuff,2016.0,8.7
3870,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2


In [47]:
movie_top_year = movie3.drop_duplicates(subset='title_year')
movie_top_year.head()

Unnamed: 0,movie_title,title_year,imdb_score
4409,Kickboxer: Vengeance,2016.0,9.1
3816,Running Forever,2015.0,8.6
4468,Queen of the Mountains,2014.0,8.7
4017,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5


In [48]:
movie4 = movie[['movie_title', 'title_year', 'content_rating', 'budget']]
movie4_sorted = movie4.sort_values(['title_year', 'content_rating', 'budget'], 
                                   ascending=[False, False, True])
movie4_sorted.drop_duplicates(subset=['title_year', 'content_rating']).head(10)

Unnamed: 0,movie_title,title_year,content_rating,budget
4108,Compadres,2016.0,R,3000000.0
4772,Fight to the Finish,2016.0,PG-13,150000.0
4775,Rodeo Girl,2016.0,PG,500000.0
3309,The Wailing,2016.0,Not Rated,
4773,Alleluia! The Devil's Carnival,2016.0,,500000.0
4848,Bizarre,2015.0,Unrated,500000.0
821,The Ridiculous 6,2015.0,TV-14,
4956,The Gallows,2015.0,R,100000.0
4948,Romantic Schemer,2015.0,PG-13,125000.0
3868,R.L. Stine's Monsterville: The Cabinet of Souls,2015.0,PG,4400000.0


# Replicating nlargest with sort_values

In [49]:
movie = pd.read_csv('../notebooks/data/movie.csv')
movie2 = movie[['movie_title', 'imdb_score', 'budget']]
movie_smallest_largest = movie2.nlargest(100, 'imdb_score').nsmallest(5, 'budget')
movie_smallest_largest

Unnamed: 0,movie_title,imdb_score,budget
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0
2242,Psycho,8.5,806947.0


In [50]:
movie2.sort_values('imdb_score', ascending=False).head(100).head()

Unnamed: 0,movie_title,imdb_score,budget
2765,Towering Inferno,9.5,
1937,The Shawshank Redemption,9.3,25000000.0
3466,The Godfather,9.2,6000000.0
4409,Kickboxer: Vengeance,9.1,17000000.0
2824,Dekalog,9.1,


In [51]:
movie2.sort_values('imdb_score', ascending=False).head(100).sort_values('budget').head()

Unnamed: 0,movie_title,imdb_score,budget
4937,A Charlie Brown Christmas,8.4,150000.0
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0


In [52]:
movie2.nlargest(100, 'imdb_score').tail()

Unnamed: 0,movie_title,imdb_score,budget
4017,"Batman: The Dark Knight Returns, Part 2",8.4,3500000.0
4105,Oldboy,8.4,3000000.0
4253,To Kill a Mockingbird,8.4,2000000.0
4496,Reservoir Dogs,8.4,1200000.0
4659,A Separation,8.4,500000.0


In [53]:
movie2.sort_values('imdb_score', ascending=False).head(100).tail()

Unnamed: 0,movie_title,imdb_score,budget
3849,Requiem for a Dream,8.4,4500000.0
3623,Veronica Mars,8.4,
3685,Rang De Basanti,8.4,
2362,The Shining,8.4,19000000.0
3807,Psych,8.4,


# Calculating a trailing stop order price

In [55]:
import pandas_datareader as pdr

### Note: pandas_datareader issues
pandas_datareader can have issues when the source is 'google'. It can also read from Yahoo! finance. Try switching it to 'yahoo'

In [60]:
# Alternative approach using yfinance
# First install: pip install yfinance
import yfinance as yf

# Get Tesla stock data for the past year
tsla = yf.download('TSLA', period='1y')
tsla.head(8)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,TSLA,TSLA,TSLA,TSLA,TSLA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2024-03-13,169.479996,176.050003,169.149994,173.050003,106524500
2024-03-14,162.5,171.169998,160.509995,167.770004,126325700
2024-03-15,163.570007,165.179993,160.759995,163.160004,96971900
2024-03-18,173.800003,174.720001,165.899994,170.020004,108214400
2024-03-19,171.320007,172.820007,167.419998,172.360001,77271400
2024-03-20,175.660004,176.25,170.820007,173.0,83846700
2024-03-21,172.820007,178.179993,171.800003,176.389999,73178000
2024-03-22,170.830002,171.199997,166.300003,166.690002,75454700


In [61]:
tsla_close = tsla['Close']

In [62]:
tsla_cummax = tsla_close.cummax()
tsla_cummax.head(8)

Ticker,TSLA
Date,Unnamed: 1_level_1
2024-03-13,169.479996
2024-03-14,169.479996
2024-03-15,169.479996
2024-03-18,173.800003
2024-03-19,173.800003
2024-03-20,175.660004
2024-03-21,175.660004
2024-03-22,175.660004


In [63]:
tsla_trailing_stop = tsla_cummax * .9
tsla_trailing_stop.head(8)

Ticker,TSLA
Date,Unnamed: 1_level_1
2024-03-13,152.531996
2024-03-14,152.531996
2024-03-15,152.531996
2024-03-18,156.420003
2024-03-19,156.420003
2024-03-20,158.094003
2024-03-21,158.094003
2024-03-22,158.094003


## There's more...

In [66]:
def set_trailing_loss(symbol, purchase_date, perc):
    """
    Calculate a trailing stop loss for a given stock symbol.
    
    Parameters:
    -----------
    symbol : str
        The stock ticker symbol (e.g., 'MSFT' for Microsoft)
    purchase_date : str
        The date from which to start tracking, in 'YYYY-MM-DD' format
    perc : float
        The percentage (as a decimal) of the maximum price to set as the stop loss
        
    Returns:
    --------
    pandas.Series
        A series containing the trailing stop loss values
    """
    import yfinance as yf
    from datetime import datetime
    
    # Get data from purchase date to today
    end_date = datetime.now().strftime('%Y-%m-%d')
    data = yf.download(symbol, start=purchase_date, end=end_date)
    
    # Calculate the trailing stop based on the cumulative maximum close price
    return data['Close'].cummax() * perc

# Example usage
msft_trailing_stop = set_trailing_loss('MSFT', '2017-6-1', .85)
msft_trailing_stop.head()

[*********************100%***********************]  1 of 1 completed


Ticker,MSFT
Date,Unnamed: 1_level_1
2017-06-01,54.529771
2017-06-02,55.821081
2017-06-05,56.225543
2017-06-06,56.412252
2017-06-07,56.412252
