In [1]:
#import pandas library and load and read the data using pandas
import pandas as pd

dataset = pd.read_csv('CTT10.csv')
dataset.head()

Unnamed: 0,Data,Último,Abertura,Alta,Baixa,Vol.,Var. %
0,12.04.2021,350,357,364,346,"1,57M","-1,55%"
1,09.04.2021,356,352,357,352,"666,03K","1,14%"
2,08.04.2021,352,345,354,345,"882,54K","1,59%"
3,07.04.2021,346,343,349,343,"508,33K","0,87%"
4,06.04.2021,343,344,349,343,"809,46K","0,59%"


In [2]:
#read all the columns
print(dataset.columns)
print(dataset.shape)

Index(['Data', 'Último', 'Abertura', 'Alta', 'Baixa', 'Vol.', 'Var. %'], dtype='object')
(1875, 7)


In [3]:
#replace them the english names
new_dataset = dataset.rename(columns={'Data':'Date', 'Último':'Last', 'Abertura':'Opening', 'Alta':'High', 'Baixa':'Low'})
new_dataset.columns

Index(['Date', 'Last', 'Opening', 'High', 'Low', 'Vol.', 'Var. %'], dtype='object')

In [4]:
#read the dataset again
new_dataset.head(5)

Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %
0,12.04.2021,350,357,364,346,"1,57M","-1,55%"
1,09.04.2021,356,352,357,352,"666,03K","1,14%"
2,08.04.2021,352,345,354,345,"882,54K","1,59%"
3,07.04.2021,346,343,349,343,"508,33K","0,87%"
4,06.04.2021,343,344,349,343,"809,46K","0,59%"


Hence we can see that the column names are changed now

In [5]:
#replace the comma with a point in the numbers
new_dataset = new_dataset.stack().str.replace(',','.').unstack()
new_dataset.head()

Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %
0,12.04.2021,3.5,3.57,3.64,3.46,1.57M,-1.55%
1,09.04.2021,3.56,3.52,3.57,3.52,666.03K,1.14%
2,08.04.2021,3.52,3.45,3.54,3.45,882.54K,1.59%
3,07.04.2021,3.46,3.43,3.49,3.43,508.33K,0.87%
4,06.04.2021,3.43,3.44,3.49,3.43,809.46K,0.59%


In [6]:
#convert M and K into respective number
def sign_to_number(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0

new_dataset['Vol.'] = new_dataset['Vol.'].apply(sign_to_number)

new_dataset.head()

Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %
0,12.04.2021,3.5,3.57,3.64,3.46,1570000.0,-1.55%
1,09.04.2021,3.56,3.52,3.57,3.52,666030.0,1.14%
2,08.04.2021,3.52,3.45,3.54,3.45,882540.0,1.59%
3,07.04.2021,3.46,3.43,3.49,3.43,508330.0,0.87%
4,06.04.2021,3.43,3.44,3.49,3.43,809460.0,0.59%


In [7]:
#add one column at the last and assign negative or positive by looking at the variation column

#here i am removing % sign
new_dataset['Var. %'] = new_dataset['Var. %'].str.replace(r'%', '')


#here I am changing the series into list
lst = new_dataset['Var. %'].tolist()


#change string into float
for i in range(0, len(lst)):
    lst[i] = float(lst[i])


#new list created and appended the category into the list
new_column = []
for i in range(len(lst)):
    if(lst[i] > 0):
        new_column.append("positive")
    elif (lst[i] < 0):
        new_column.append("negative")
    else:
        new_column.append("neutral")
print(new_column)
print(len(new_column))

#now append the list column into our dataset and our new column is "category"
new_dataset['category'] = new_column

new_dataset

['negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'neutral', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'positive', 'positive', 'neutral', 'negative', 'positive', 'positive', 'negative', 'neutral', 'negative', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'negative', 'positive', 'neutral', 'positive', 'negative', 'neutral', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative', 'negative', 'positive', 'neutral', 'positive', 'neutral', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative'

Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %,category
0,12.04.2021,3.50,3.57,3.64,3.46,1570000.0,-1.55,negative
1,09.04.2021,3.56,3.52,3.57,3.52,666030.0,1.14,positive
2,08.04.2021,3.52,3.45,3.54,3.45,882540.0,1.59,positive
3,07.04.2021,3.46,3.43,3.49,3.43,508330.0,0.87,positive
4,06.04.2021,3.43,3.44,3.49,3.43,809460.0,0.59,positive
...,...,...,...,...,...,...,...,...
1870,12.12.2013,5.74,5.75,5.76,5.71,725190.0,-0.52,negative
1871,11.12.2013,5.77,5.69,5.79,5.67,1370000.0,1.23,positive
1872,10.12.2013,5.70,5.74,5.79,5.70,1430000.0,-2.06,negative
1873,09.12.2013,5.82,5.53,5.82,5.52,5800000.0,5.24,positive


In [8]:
#change the date format into month-day-year
from datetime import datetime as dt
from datetime import timedelta as td

#here we are converting the date object into in Date
new_dataset['Date'] = pd.to_datetime(new_dataset.Date)

#here I am changing the date format
new_dataset['Date'] = new_dataset['Date'].dt.strftime('%d-%m-%Y')

new_dataset.head()

  new_dataset['Date'] = pd.to_datetime(new_dataset.Date)


Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %,category
0,04-12-2021,3.5,3.57,3.64,3.46,1570000.0,-1.55,negative
1,04-09-2021,3.56,3.52,3.57,3.52,666030.0,1.14,positive
2,04-08-2021,3.52,3.45,3.54,3.45,882540.0,1.59,positive
3,04-07-2021,3.46,3.43,3.49,3.43,508330.0,0.87,positive
4,04-06-2021,3.43,3.44,3.49,3.43,809460.0,0.59,positive


# 1- Write a function to correct these points and save the file with the name of cttcorrcted.csv

In [9]:
#saving the file as a csv file

new_dataset.to_csv('cttcorrcted.csv')

# 2- What was the date with the highest volatility (difference between min and max)

In [10]:
#answer


#we are converting series into list
high_list = new_dataset['High'].tolist()
low_list = new_dataset['Low'].tolist()

#initiate new list so that the difference can be added there
diff_lst = []

#using loop to calculate each difference 
for i in range(len(new_dataset)):
    difference = float(high_list[i]) - float(low_list[i])
    diff_lst.append(difference)
    
#now append this list to column into the dataset
new_dataset['diff_lst'] = diff_lst

#since another column with differences is added now find the date for the maximum in the new column
new_dataset[new_dataset['diff_lst'] == new_dataset['diff_lst'].max()]


Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %,category,diff_lst
1440,24-08-2015,8.88,9.01,9.01,8.12,750800.0,-3.87,negative,0.89


# 3- What was the best month (30 days) during the 10 years to win with this title This means if I had to keep my stock only 30 days when was the best 30 days?

In [13]:
#30 largest values from Vol. column
# new_dataset.nlargest(30, ['Vol.'])

new_dataset['Date'] = pd.to_datetime(new_dataset['Date'])
# new_dataset['Date'].dt.to_period('M')
new_dataset['Date']

  new_dataset['Date'] = pd.to_datetime(new_dataset['Date'])


0      2021-04-12
1      2021-04-09
2      2021-04-08
3      2021-04-07
4      2021-04-06
          ...    
1870   2013-12-12
1871   2013-12-11
1872   2013-12-10
1873   2013-12-09
1874   2013-12-06
Name: Date, Length: 1875, dtype: datetime64[ns]

In [14]:
#converting into date format

new_dataset['Date'].dt.to_period('M')


0       2021-04
1       2021-04
2       2021-04
3       2021-04
4       2021-04
         ...   
1870    2013-12
1871    2013-12
1872    2013-12
1873    2013-12
1874    2013-12
Name: Date, Length: 1875, dtype: period[M]

In [46]:
#find specific month
month_data = new_dataset[new_dataset['Date'].dt.month == 1].nlargest(30, 'Date')


# 4- What was the duration of the longest sequence of ascent of the title in the closings?

In [38]:
#lets find the maximum 30 values in last column

#converting object into float
new_dataset = new_dataset.astype({'Last':'float'})


In [47]:
#we can sort the Last column in ascending order to find the sequence
seq = new_dataset.sort_values(by = 'Last', ascending = True)
seq.tail(50)

Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %,category,diff_lst,maxDiff_vol
1408,2015-10-07,10.05,10.24,10.27,10.05,459750.0,-2.0,negative,0.22,101145.0
1396,2015-10-23,10.06,10.0,10.13,9.99,329700.0,0.82,positive,0.14,46158.0
1529,2015-04-20,10.07,10.03,10.18,9.91,451120.0,0.73,positive,0.27,121802.4
1521,2015-04-30,10.07,10.02,10.07,9.96,599750.0,0.55,positive,0.11,65972.5
1407,2015-10-08,10.08,10.05,10.15,10.03,475020.0,0.3,positive,0.12,57002.4
1519,2015-05-05,10.08,10.43,10.43,10.06,513420.0,-2.84,negative,0.37,189965.4
1411,2015-10-02,10.09,10.0,10.09,9.97,289960.0,1.41,positive,0.12,34795.2
1514,2015-05-12,10.1,10.11,10.2,9.95,538850.0,-0.69,negative,0.25,134712.5
1507,2015-05-21,10.1,10.11,10.26,10.03,379470.0,-0.54,negative,0.23,87278.1
1415,2015-09-28,10.11,10.18,10.19,10.06,267610.0,-0.64,negative,0.13,34789.3


since the logest sequence in 2015 where all the values in closing are high

# 5- What was the date that saw the greatest turmoil in the market, ie large volumes with important variations? You can choose for example volume * (max-min) to get a measure of turbulence).

In [25]:
#initiate new list for volume*(high - low)
max_diff_vol = []

#using loop to calculate each entry's product respectively  
for i in range(len(new_dataset)):
    product = (new_dataset['Vol.'][i])*(new_dataset['diff_lst'][i])
    max_diff_vol.append(product)
    
    
new_dataset['maxDiff_vol'] = max_diff_vol
new_dataset.head()

Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %,category,diff_lst,maxDiff_vol
0,2021-04-12,3.5,3.57,3.64,3.46,1570000.0,-1.55,negative,0.18,282600.0
1,2021-04-09,3.56,3.52,3.57,3.52,666030.0,1.14,positive,0.05,33301.5
2,2021-04-08,3.52,3.45,3.54,3.45,882540.0,1.59,positive,0.09,79428.6
3,2021-04-07,3.46,3.43,3.49,3.43,508330.0,0.87,positive,0.06,30499.8
4,2021-04-06,3.43,3.44,3.49,3.43,809460.0,0.59,positive,0.06,48567.6


In [28]:
#now find the date when maxDiff_vol column is maximum

new_dataset[new_dataset['maxDiff_vol'] == new_dataset['maxDiff_vol'].max()]


Unnamed: 0,Date,Last,Opening,High,Low,Vol.,Var. %,category,diff_lst,maxDiff_vol
876,2017-11-01,3.96,4.5,4.5,3.95,11060000.0,-21.68,negative,0.55,6083000.0
