In [1]:
# Module 9
# In this module, we are going to learn how to handle missing values
# and how to clean up our data a bit.

# Almost every dataset we work with is likely to have some missing data,
# or data that has to be cleaned up and converted into different datatypes.
# At the end of the module, we will combine what we have learnt throughout the module
# and calculate the median experience of the developers.

In [18]:
import pandas as pd
import numpy as np

# Sample data containing several kinds of "missing" markers:
# np.nan, None, and the custom placeholder strings "NA" and "Missing".
mul_people={
    "first":[
        "Ahammad","Corey","John","Jaden",
        "Boss",np.nan,None,"NA",
    ],
    "last":[
        "Shawki","Schafer","Joker","Joker",
        "Zack",np.nan,np.nan,"Missing",
    ],
    "email":[
        "ahammadshawki8@gmail.com",
        "CoreyMScafer@gmail.com",
        "joker420@outlook.com",
        "jokerOfficial@yahoo.com",
        None,
        None,
        "anonymous@email.com",
        "Missing",
    ],
    # the ages are stored as strings here, not as numbers
    "age":[
        "15","27","55","57",
        "22",np.nan,np.nan,"NA",
    ],
}

# build the dataframe from the dictionary of columns
dic_df=pd.DataFrame(data=mul_people)

dic_df

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
4,Boss,Zack,,22.0
5,,,,
6,,,anonymous@email.com,
7,,Missing,Missing,


In [3]:
# OK, let's first talk about how to drop missing values.
# We can see that we have some null values in our dataframe.
# np.nan is NumPy's "Not a Number" value.

# We might want to handle missing data in many ways.
# One thing we might want to do with missing data is
# to simply remove the rows that don't have valid values.
# To do this we can use the dropna() method.
dic_df.dropna()

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
7,,Missing,Missing,


In [4]:
# Here we still have row 7, which holds custom missing values.
# But for now, let's go over what dropna() is actually doing here.
# In the background, dropna() is using some default arguments.
# We are going to fill in these default arguments manually,
# which should make it clearer why we get this specific result.
#dic_df.dropna(axis="index",how="any")
# It has an axis argument set to "index" and a how argument set to "any".
# The axis argument can be set to either "index" or "columns".
# When it is set to "index", it tells pandas
# that we want to drop rows that are missing values.
# When it is set to "columns", it tells pandas
# that we want to drop columns that are missing values instead.
# The second argument, how, is the criterion
# that pandas uses for dropping a row or a column.
# With "any", pandas looks over our rows and drops every row that has any missing value.
# But this might not be what we want.
# Depending on the analysis we are doing, it may be okay to have a missing email, first or last name,
# as long as there is something in the row; we can't afford an entire row of missing values.
# If that's the case, then we can change the how argument to "all".
# This will only drop a row if all of the values in that row are missing.
dic_df.dropna(axis="index",how="all")

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
4,Boss,Zack,,22.0
6,,,anonymous@email.com,
7,,Missing,Missing,


In [5]:
# If we change the axis argument to "columns", it will drop the columns in which all the values are missing.
# Currently we don't have any column like that,
# so it should give us our original dataframe.
dic_df.dropna(axis="columns",how="all")

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
4,Boss,Zack,,22.0
5,,,,
6,,,anonymous@email.com,
7,,Missing,Missing,


In [6]:
# If we set the how argument to "any", it will give us an empty dataframe,
# because every column that has at least one missing value will be dropped.
dic_df.dropna(axis="columns",how="any")

0
1
2
3
4
5
6
7


In [7]:
# We might be working on an analysis where we need to drop the rows
# that have missing values in some specific columns only, not in all or any column.

# For example, let's say we are doing an analysis on our dataframe.
# It's fine if a respondent doesn't have a first or last name,
# but they definitely need an email address.
# If they don't have an email address, we just need to drop those rows.
# To do this we can pass a subset argument.
# The subset is a list of the column names that we are checking for missing values.
# Note that here we pass it as a list.
dic_df.dropna(axis="index",how="any",subset=["email"])

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
6,,,anonymous@email.com,
7,,Missing,Missing,


In [8]:
# Now we can see that we still have a missing value in the email column,
# but that is just a custom missing value.
# We will see how to handle that later.

# In this case, since we are passing a single column in the subset argument, the how argument makes no difference:
# pandas only looks at the email column for missing values,
# so passing "all" instead of "any" gives us the same result.
dic_df.dropna(axis="index",how="all",subset=["email"])

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
6,,,anonymous@email.com,
7,,Missing,Missing,


In [9]:
# We can also pass multiple columns in our subset.
# For example, say that for our analysis to be successful
# we need either the last name or the email address, but we don't need both.
dic_df.dropna(axis="index",how="all",subset=["last","email"])
# Here we need to pass "all" as the how argument, which means that
# a row is only removed if both the last and email columns have missing values.
# If we want to change our data permanently, we need to use the inplace argument and set it equal to True.

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
4,Boss,Zack,,22.0
6,,,anonymous@email.com,
7,,Missing,Missing,


In [10]:
# We can also pass "any" as the how argument.
# That will drop the rows where either the last or the email column has a missing value.
dic_df.dropna(axis="index",how="any",subset=["last","email"])

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
7,,Missing,Missing,


In [23]:
# Now let's see how to handle custom missing values.
# Here we have custom missing values like "NA" and "Missing".

# How we handle them depends on how we load our data.
# In this case we created our dataframe from scratch, by building a dictionary
# and turning it into a dataframe with pd.DataFrame().
# So, after creating the dataframe, we can simply replace those values with NaN using the replace() method.
# We also set the inplace argument equal to True so that the dataframe itself is changed.
# If we load our data from a file instead, we can do it differently.
# We will look at that later.

# both placeholder strings are swapped for NaN in a single call
dic_df.replace(["NA","Missing"],np.nan,inplace=True)
dic_df

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15.0
1,Corey,Schafer,CoreyMScafer@gmail.com,27.0
2,John,Joker,joker420@outlook.com,55.0
3,Jaden,Joker,jokerOfficial@yahoo.com,57.0
4,Boss,Zack,,22.0
5,,,,
6,,,anonymous@email.com,
7,,,,


In [12]:
# If we don't actually want to make any changes and just want to see whether certain values would be treated as NA values,
# we can run the isna() method to get a mask showing whether or not each value is classified as NA.
dic_df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,True,True,True,True
6,True,True,False,True
7,True,True,True,True


In [13]:
# Sometimes when we are working with numeric data,
# we might want to fill our NA values with a particular value, typically 0 or -1.
# For example, we might be analysing data where NA values should be given a score of 0.
# To do something like this we can use the fillna() method.
dic_df.fillna(0)
# Here we have to pass in the value that we want to fill in place of the NA values.

Unnamed: 0,first,last,email,age
0,Ahammad,Shawki,ahammadshawki8@gmail.com,15
1,Corey,Schafer,CoreyMScafer@gmail.com,27
2,John,Joker,joker420@outlook.com,55
3,Jaden,Joker,jokerOfficial@yahoo.com,57
4,Boss,Zack,0,22
5,0,0,0,0
6,0,0,anonymous@email.com,0
7,0,0,0,0


In [14]:
# We often need to cast data when we analyse it.
# Let's say we want to calculate the mean age of the respondents.
# Right now the ages might look like numbers when we print our dataframe,
# but they are actually strings.
# We can see that with the dtypes attribute.
dic_df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [15]:
# We can see that all of these columns are object.
# When a column is reported as object, it likely means that it holds strings or a mix of different datatypes.
# So if we try to get the average age with mean(), it will not work.
dic_df["age"].mean()
# This will give us a type error.
# TypeError: can only concatenate str (not "int") to str

TypeError: can only concatenate str (not "int") to str

In [17]:
# So we need to convert that column to numbers instead of strings.
# There is a caveat when doing this that might throw some people off.
# When we have NaN values in our column, we need to convert it to the float datatype,
# because np.nan values are actually floats under the hood.
type(np.nan)

float

In [26]:
# We can see that it is a float.
# So if we try to convert this column to integers, we will get an error, because NaN cannot be converted to an integer.
# We can change the datatype using the astype() method.
dic_df["age"]=dic_df["age"].astype(float)

# So if our column doesn't have any missing values, we can use int as the argument of astype().
# But if our column has any missing values, we need to use float as the argument of astype().

# Remember that we could also convert the missing values to 0 using the fillna() method,
# but that would be a bad idea here, because the zeros would distort the mean we are about to calculate.

In [28]:
# Check the dtypes again to confirm that the age column has been converted.
dic_df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [29]:
# Now let's take the average of that column using the mean() method.
dic_df["age"].mean()

35.2

In [30]:
# Now, if we have an entire dataframe of numbers that we want to convert all at once,
# the dataframe object has an astype() method as well.
# dic_df.astype()
# But we have mixed columns here, so we don't do that here.

In [32]:
# Now let's apply what we have learned to the real-world Stack Overflow survey data.
# As mentioned earlier, if we have custom missing values in our survey data,
# it is a little easier to handle them while loading the CSV.

# To have these custom values treated as missing when loading data from a CSV,
# we can simply pass the na_values argument: a list of values that we want to be treated as NaN.

# display settings: show up to 85 columns and 85 rows
pd.set_option("display.max_columns", 85)
pd.set_option("display.max_rows", 85)

na_vals=["NA","Missing","NaN"]

# survey answers, indexed by respondent
df=pd.read_csv(
    "stackoverflow/survey_results_public.csv",
    index_col="Respondent",
    na_values=na_vals,
)
# survey schema, indexed by column name
schema_df=pd.read_csv(
    "stackoverflow/survey_results_schema.csv",
    index_col="Column",
)

df.head()

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,OrgSize,DevType,YearsCode,Age1stCode,YearsCodePro,CareerSat,JobSat,MgrIdiot,MgrMoney,MgrWant,JobSeek,LastHireDate,LastInt,FizzBuzz,JobFactors,ResumeUpdate,CurrencySymbol,CurrencyDesc,CompTotal,CompFreq,ConvertedComp,WorkWeekHrs,WorkPlan,WorkChallenge,WorkRemote,WorkLoc,ImpSyn,CodeRev,CodeRevHrs,UnitTests,PurchaseHow,PurchaseWhat,LanguageWorkedWith,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,DevEnviron,OpSys,Containers,BlockchainOrg,BlockchainIs,BetterLife,ITperson,OffOn,SocialMedia,Extraversion,ScreenName,SOVisit1st,SOVisitFreq,SOVisitTo,SOFindAnswer,SOTimeSaved,SOHowMuchTime,SOAccount,SOPartFreq,SOJobs,EntTeams,SOComm,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",,,4.0,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,HTML/CSS;Java;JavaScript;Python,C;C++;C#;Go;HTML/CSS;Java;JavaScript;Python;SQL,SQLite,MySQL,MacOS;Windows,Android;Arduino;Windows,Django;Flask,Flask;jQuery,Node.js,Node.js,IntelliJ;Notepad++;PyCharm,Windows,I do not use containers,,,Yes,"Fortunately, someone else has that title",Yes,Twitter,Online,Username,2017,A few times per month or weekly,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,31-60 minutes,No,,"No, I didn't know that Stack Overflow had a jo...","No, and I don't know what those are",Neutral,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,,"Developer, desktop or enterprise applications;...",,17,,,,,,,I am actively looking for a job,I've never had a job,,,Financial performance or funding status of the...,"Something else changed (education, award, medi...",,,,,,,,,,,,,,,,,C++;HTML/CSS;Python,C++;HTML/CSS;JavaScript;SQL,,MySQL,Windows,Windows,Django,Django,,,Atom;PyCharm,Windows,I do not use containers,,Useful across many domains and could change ma...,Yes,Yes,Yes,Instagram,Online,Username,2017,Daily or almost daily,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,11-30 minutes,Yes,A few times per month or weekly,"No, I knew that Stack Overflow had a job board...","No, and I don't know what those are","Yes, somewhat",Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",100 to 499 employees,"Designer;Developer, back-end;Developer, front-...",3.0,22,1,Slightly satisfied,Slightly satisfied,Not at all confident,Not sure,Not sure,"I’m not actively looking, but I am open to new...",1-2 years ago,Interview with people in peer roles,No,"Languages, frameworks, and other technologies ...",I was preparing for a job search,THB,Thai baht,23000.0,Monthly,8820.0,40.0,There's no schedule or spec; I work on what se...,Distracting work environment;Inadequate access...,Less than once per month / Never,Home,Average,No,,"No, but I think we should",Not sure,I have little or no influence,HTML/CSS,Elixir;HTML/CSS,PostgreSQL,PostgreSQL,,,,Other(s):,,,Vim;Visual Studio Code,Linux-based,I do not use containers,,,Yes,Yes,Yes,Reddit,In real life (in person),Username,2011,A few times per week,Find answers to specific questions;Learn how t...,6-10 times per week,They were about the same,,Yes,Less than once per month or monthly,Yes,"No, I've heard of them, but I am not part of a...",Neutral,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,100 to 499 employees,"Developer, full-stack",3.0,16,Less than 1 year,Very satisfied,Slightly satisfied,Very confident,No,Not sure,I am not interested in new job opportunities,Less than a year ago,"Write code by hand (e.g., on a whiteboard);Int...",No,"Languages, frameworks, and other technologies ...",I was preparing for a job search,USD,United States dollar,61000.0,Yearly,61000.0,80.0,There's no schedule or spec; I work on what se...,,Less than once per month / Never,Home,A little below average,No,,"No, but I think we should",Developers typically have the most influence o...,I have little or no influence,C;C++;C#;Python;SQL,C;C#;JavaScript;SQL,MySQL;SQLite,MySQL;SQLite,Linux;Windows,Linux;Windows,,,.NET,.NET,Eclipse;Vim;Visual Studio;Visual Studio Code,Windows,I do not use containers,Not at all,"Useful for decentralized currency (i.e., Bitcoin)",Yes,SIGH,Yes,Reddit,In real life (in person),Username,2014,Daily or almost daily,Find answers to specific questions;Pass the ti...,1-2 times per week,Stack Overflow was much faster,31-60 minutes,Yes,Less than once per month or monthly,Yes,"No, and I don't know what those are","No, not really",Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,"10,000 or more employees","Academic researcher;Developer, desktop or ente...",16.0,14,9,Very dissatisfied,Slightly dissatisfied,Somewhat confident,Yes,No,I am not interested in new job opportunities,Less than a year ago,"Write any code;Write code by hand (e.g., on a ...",No,"Industry that I'd be working in;Languages, fra...",I was preparing for a job search,UAH,Ukrainian hryvnia,,,,55.0,There is a schedule and/or spec (made by me or...,Being tasked with non-development work;Inadequ...,A few days each month,Office,A little above average,"Yes, because I see value in code review",,"Yes, it's part of our process",Not sure,I have little or no influence,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,HTML/CSS;Java;JavaScript;SQL;WebAssembly,Couchbase;MongoDB;MySQL;Oracle;PostgreSQL;SQLite,Couchbase;Firebase;MongoDB;MySQL;Oracle;Postgr...,Android;Linux;MacOS;Slack;Windows,Android;Docker;Kubernetes;Linux;Slack,Django;Express;Flask;jQuery;React.js;Spring,Flask;jQuery;React.js;Spring,Cordova;Node.js,Apache Spark;Hadoop;Node.js;React Native,IntelliJ;Notepad++;Vim,Linux-based,"Outside of work, for personal projects",Not at all,,Yes,Also Yes,Yes,Facebook,In real life (in person),Username,I don't remember,Multiple times per day,Find answers to specific questions,More than 10 times per week,Stack Overflow was much faster,,Yes,A few times per month or weekly,"No, I knew that Stack Overflow had a job board...","No, I've heard of them, but I am not part of a...","Yes, definitely",Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


In [34]:
# In this survey they did a good job of avoiding odd custom missing values like these,
# so what we did in the previous cell shouldn't actually change anything.

# Let's say we want to calculate the average experience of all respondents of this survey.
# For this problem we will have to apply several concepts that we have learnt in this module.
# The column used to represent experience is called "YearsCode".
# Let's look at that column.
df["YearsCode"].head(15)

Respondent
1       4
2     NaN
3       3
4       3
5      16
6      13
7       6
8       8
9      12
10     12
11      2
12      5
13     17
14     13
15      3
Name: YearsCode, dtype: object

In [35]:
# At first glance it doesn't look like we have a problem.
# We might think we can grab the mean of the column simply by saying:
df["YearsCode"].mean()

TypeError: can only concatenate str (not "int") to str

In [39]:
# We can see that we are getting:
#TypeError: can only concatenate str (not "int") to str
# So the column was actually loaded as strings instead of numeric data.
# Let's try to convert it to floats and then take the average.
df["YearsCode"]=df["YearsCode"].astype(float)

ValueError: could not convert string to float: 'Less than 1 year'

In [40]:
# We can see that we are still getting an error:
#ValueError: could not convert string to float: 'Less than 1 year'
# Let's look at all the unique values of this column so that we can see exactly what is in there, in case there are more strings like this.
# To view the unique values of a series we can simply use the unique() method.
df["YearsCode"].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [42]:
# We can see that we have some text labels mixed in with the numbers.
# The labels are:
# 'Less than 1 year'
# 'More than 50 years'

# Let's replace 'Less than 1 year' with 1
# and 'More than 50 years' with 51.

# Assign the result back to the column instead of calling replace(inplace=True) on df["YearsCode"]:
# an in-place call on a selected column may only modify a temporary copy,
# and it never updates df when pandas Copy-on-Write is enabled.
df["YearsCode"]=df["YearsCode"].replace({'Less than 1 year':1,'More than 50 years':51})

In [43]:
# Now if we look at the unique values again, we can see that the two text labels are gone.
# (The remaining values are still strings, so the column is converted to float in the next step.)
df["YearsCode"].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 1, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37', 51, '29',
       '44', '45', '48', '46', '43', '47', '49'], dtype=object)

In [45]:
# Now we can use our previous approach: convert the column to float and calculate the average.
df["YearsCode"]=df["YearsCode"].astype(float)
df["YearsCode"].mean()

11.677659259933135

In [46]:
# We can do other analysis on this column as well, for example the median.
df["YearsCode"].median()

9.0