# In [None]:
import re
import numpy as np
import pandas as pd

# Column order of the parsed housing table.
HOUSING_COLUMNS = ['houseID','builder','dateBuilt','datePriced','garden','dockDistance','capitalDistance','marketDistance','towerDistance','riverDistance','knightDistance','renovated','diningRooms','bedRooms','bathRooms','kingVisit','cursed','blessings','farmland','location','holyTree']

# One "<builder>.txt" file is read per entry.
builders = ['Bob','Bright_Brothers','Masters_of_Stones','The_Greens','The_Kings','The_Lannisters','The_Ollivers','The_Overlords','The_Starks','Wood_Priests','Not_Known']


def _capture(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or NaN when absent."""
    found = re.search(pattern, text)
    return found.group(1) if found else np.nan


def _line_flag(line_pattern, marker_pattern, text, if_marker, otherwise):
    """Classify the text matched by *line_pattern*.

    Returns NaN when *line_pattern* does not match *text*; otherwise returns
    *if_marker* when *marker_pattern* is found inside the matched text and
    *otherwise* when it is not.
    """
    found = re.search(line_pattern, text)
    if not found:
        return np.nan
    return if_marker if re.search(marker_pattern, found.group()) else otherwise


def parse_house(house, builder):
    """Extract one house record from its (lower-cased) text description.

    Args:
        house: text of a single record, as produced by splitting a builder
            file on blank lines.
        builder: builder name stored verbatim in the ``builder`` field.

    Returns:
        dict keyed by the names in ``HOUSING_COLUMNS``. Captured numbers and
        dates are kept as the matched strings; attributes whose line is not
        found are NaN.

    Raises:
        ValueError: if the record contains no ``<label>: <8-char id>`` field.
    """
    id_match = re.search(r'\w+\s*:\s*(\w{8})', house)
    if id_match is None:
        raise ValueError(
            "no house ID found in a record of builder {!r}: {!r}".format(builder, house[:80])
        )

    # farmland: no = 0, small = 1, huge = 2
    farm_line = re.search(r'.*farm.*\n', house)
    if not farm_line:
        farmland = np.nan
    elif re.search(r'.*small.*', farm_line.group()):
        farmland = 1
    elif re.search(r'.*huge.*', farm_line.group()):
        farmland = 2
    else:
        farmland = 0

    location_match = re.search(r'.*location.*:(.*)\n', house)
    location = location_match.group(1).strip() if location_match else np.nan

    dates = re.search(r'date[a-z\s:]*([0-9/:\spa]*).*date[a-z\s:]*([0-9/:\spa]*).*\n', house)
    if dates:
        # The capture's character class has 'a' and 'p' but no 'm', so it stops
        # before the final letter of am/pm; put that letter back.
        dateBuilt = dates.group(1) + 'm'
        datePriced = dates.group(2) + 'm'
    else:
        dateBuilt = datePriced = np.nan

    return {
        'houseID': id_match.group(1),
        'builder': builder,
        'dateBuilt': dateBuilt,
        'datePriced': datePriced,
        # a garden line containing "no" means there is no garden
        'garden': _line_flag(r'\n[\w\s]*garden[\w\s]*\n', r'.*no.*', house, 0, 1),
        'dockDistance': _capture(r'.*distance.*dock[a-zA-Z\s]*([0-9]+\.[0-9]*).*\n', house),
        'capitalDistance': _capture(r'.*distance.*capital[a-zA-Z\s]*([0-9]+\.[0-9]*).*\n', house),
        'marketDistance': _capture(r'.*distance.*market[a-zA-Z\s]*([0-9]+\.[0-9]*).*\n', house),
        'towerDistance': _capture(r'.*distance.*tower[a-zA-Z\s]*([0-9]+\.[0-9]*).*\n', house),
        'riverDistance': _capture(r'.*distance.*river[a-zA-Z\s]*([0-9]+\.?[0-9]*).*\n', house),
        'knightDistance': _capture(r'.*distance.*knight[a-z\s\']*([0-9]+\.?[0-9]*).*', house),
        'renovated': _line_flag(r'.*renovation.*\n', r'.*mighty.*', house, 1, 0),
        'diningRooms': _capture(r'([0-9]+).*dining.*\n', house),
        'bedRooms': _capture(r'([0-9]+).*bedroom.*\n', house),
        'bathRooms': _capture(r'([0-9]+).*bathroom.*\n', house),
        # Previously the "great" lookup ran before checking that a visit line
        # existed, which crashed on records without one; now it yields NaN.
        'kingVisit': _line_flag(r'.*visit.*\n', r'.*great.*', house, 1, 0),
        'cursed': _line_flag(r'.*sorcerer.*\n', r'.*!!\n', house, 1, 0),
        'blessings': _capture(r'[a-z ]*([0-9]+).*blessing.*\n', house),
        'farmland': farmland,
        'location': location,
        # a holy-tree line mentioning "witch" is encoded as 0, any other as 1
        'holyTree': _line_flag(r'.*holy.*tree.*\n', r'.*witch.*\n', house, 0, 1),
    }


def load_housing(builder_names):
    """Parse every ``<builder>.txt`` file into a single DataFrame.

    Each file is lower-cased and split into records on runs of two or more
    newlines; blank records are skipped.

    Args:
        builder_names: iterable of builder names; ``name + ".txt"`` is opened
            relative to the current working directory.

    Returns:
        DataFrame with columns ``HOUSING_COLUMNS`` and a 0..n-1 index, most
        recently parsed record first.
    """
    rows = []
    for builder in builder_names:
        with open(builder + ".txt") as file:
            rawData = file.read().lower()
        for house in re.split(r'\n{2,}', rawData):
            if not house.strip():
                continue
            rows.append(parse_house(house, builder))
    # Collect rows in a list and build the frame once (linear time). Reversing
    # keeps the newest-first order the earlier prepend-per-row loop produced.
    rows.reverse()
    return pd.DataFrame(rows, columns=HOUSING_COLUMNS)


if __name__ == "__main__":
    housing = load_housing(builders)
    print(housing)

# Attach the known prices to the parsed houses, then write the priced rows to
# train.csv and the unpriced rows to test.csv.
house_prices = pd.read_csv('house_prices.csv')
missing = pd.read_csv('missing.csv')  # loaded but not used below; kept so the name stays defined
housing.sort_values(by='houseID',inplace=True)
house_prices.sort_values(by='House ID',inplace=True)
# drop=True: otherwise the pre-sort index is kept as an extra 'index' column,
# which ends up in both CSVs and shifts every column position by one.
housing.reset_index(drop=True, inplace=True)

# Join on the house ID itself. Plain column assignment aligns on index labels,
# and house_prices keeps its original labels after sort_values, so the two
# sorts above do not pair rows up. IDs are compared lower-cased because the
# parsed houseID values come from lower-cased text.
price_keys = house_prices['House ID'].astype(str).str.strip().str.lower()
price_by_id = pd.Series(house_prices['Golden Grains'].to_numpy(), index=price_keys.to_numpy())
price_by_id = price_by_id[~price_by_id.index.duplicated(keep='first')]
if len(price_by_id) and not housing['houseID'].isin(price_by_id.index).any():
    # Fail loudly instead of silently writing an empty training set.
    raise ValueError("no 'House ID' in house_prices.csv matches any parsed houseID")
housing['goldenGrains'] = housing['houseID'].map(price_by_id)

has_price = housing['goldenGrains'].notna()
train = housing[has_price].sort_index()
test = housing[~has_price].drop(columns='goldenGrains').sort_index()
# The CSV index column is written 1-based.
train.index += 1
test.index += 1
train.to_csv('train.csv')
test.to_csv('test.csv')

# In [2]:
import pandas as pd
import numpy as np
import re
from sklearn import linear_model
import datetime
# Integer codes for the builder column.
# NOTE(review): 'The_Kings' and 'The_Lannisters' share code 5, so the model
# cannot tell them apart — confirm whether that is intended before renumbering.
BUILDER_CODES = {'Bob':0, 'Bright_Brothers':1,'Masters_of_Stones':2,'Not_Known':3,'The_Greens':4,'The_Kings':5,'The_Lannisters':5,'The_Ollivers':6,'The_Overlords':7,'The_Starks':8,'Wood_Priests':9 }

# Integer codes for the location column.
LOCATION_CODES = {'the mountains':0,"king's landing": 1,"servant's premises":2,"cursed land":3}

# Replacement for missing values, per column. riverDistance, kingVisit and
# blessings are not listed, so they are left as they are.
# NOTE(review): the parser in this file only emits 0/1 for holyTree, so 3 acts
# as a separate "unknown" code — confirm that is intended.
FILL_VALUES = {
    'knightDistance': 34.365941,
    'capitalDistance': 34.365941,
    'marketDistance': 48.715817,
    'towerDistance': 101.110421,
    'dockDistance': 46.305520,
    'location': 'the mountains',
    'farmland': 1,
    'diningRooms': 3,
    'bathRooms': 2,
    'bedRooms': 3,
    'holyTree': 3,
    'cursed': 0,
    'renovated': 0,
    'garden': 0,
}

_DATE_PREFIX = re.compile(r'[0-9]+/[0-9]+(/.*)')


def pin_century(text):
    """Overwrite the second character of the year in an ``a/b/year...`` string with '9'.

    For a four-digit year this replaces the hundreds digit, e.g.
    ``'12/5/1615 3:45 PM'`` becomes ``'12/5/1915 3:45 PM'``.
    Non-string values, strings without an ``a/b/`` prefix and strings too
    short to hold that character are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    found = _DATE_PREFIX.search(text)
    if found is None:
        return text
    target = found.start(1) + 2  # start(1) is the slash in front of the year
    if target >= len(text):
        return text
    return text[:target] + '9' + text[target + 1:]


def _add_date_parts(frame, column, suffix):
    """Parse *column* in place and add HOUR/MONTH/DATE/YEAR columns named with *suffix*."""
    stamps = pd.to_datetime(frame[column].str.upper().map(pin_century))
    frame[column] = stamps
    frame['HOUR' + suffix] = stamps.dt.hour
    frame['MONTH' + suffix] = stamps.dt.month
    frame['DATE' + suffix] = stamps.dt.day
    frame['YEAR' + suffix] = stamps.dt.year


def prepare_features(raw):
    """Return a copy of *raw* with dates parsed, categories encoded and gaps filled.

    The same steps are applied to the training and the test frame, and columns
    are addressed by name, so neither the row count nor the column order of
    the CSV matters.

    Args:
        raw: DataFrame holding at least dateBuilt, datePriced, builder,
            riverDistance and every column named in ``FILL_VALUES``.

    Returns:
        New DataFrame with these additional columns: HOUR, MONTH, DATE, YEAR,
        HOURPriced, MONTHPriced, DATEPriced, YEARPriced, prod, proddis and
        proddis2. *raw* itself is not modified.

    Raises:
        KeyError: if a builder or location value has no entry in the code maps.
    """
    frame = raw.copy()
    _add_date_parts(frame, 'dateBuilt', '')
    _add_date_parts(frame, 'datePriced', 'Priced')

    frame['builder'] = frame['builder'].apply(lambda name: BUILDER_CODES[name])

    for column, value in FILL_VALUES.items():
        frame[column] = frame[column].fillna(value)

    # Encode location only after filling, so missing values get a code too.
    frame['location'] = frame['location'].apply(lambda place: LOCATION_CODES[place])

    # Floor of the mean room count.
    frame['prod'] = (frame['bedRooms'] + frame['bathRooms'] + frame['diningRooms']) // 3.0
    frame['proddis'] = (frame['dockDistance'] * frame['marketDistance'] * frame['towerDistance'] * frame['riverDistance']) ** (1 / 3)
    frame['proddis2'] = np.sqrt(frame['capitalDistance'] * frame['knightDistance'])
    return frame


if __name__ == "__main__":
    iris = pd.read_csv("train.csv")
    df = prepare_features(iris)

    k = 4
    tr = df
    tE = pd.read_csv('test.csv')
    te = prepare_features(tE)
    target_map = BUILDER_CODES  # name kept for code that still refers to it
### 

# Fit an ordinary least-squares model on the training frame and write the
# predictions for the test frame to answer4final.csv.

# Feature columns, listed once so train and test cannot drift apart.
FEATURES = ['builder','dockDistance',  'marketDistance', 'towerDistance', 'riverDistance','capitalDistance','diningRooms','bedRooms',
 'bathRooms' ,'kingVisit', 'cursed', 'blessings', 'farmland', 'location', 'holyTree','YEAR','YEARPriced','MONTH','MONTHPriced','HOUR','prod']

# The `normalize` argument was removed in scikit-learn 1.2, so passing it fails
# on current releases. Rescaling the inputs does not change an ordinary
# least-squares fit beyond floating-point effects, so it is simply dropped.
model = linear_model.LinearRegression()
tr_X = tr[FEATURES]            # training features
tr_y = tr[['goldenGrains']]    # training target, kept 2-D so predict() returns one column
te_X = te[FEATURES]            # test features

model.fit(tr_X, tr_y)
prediction = model.predict(te_X)
pred = pd.DataFrame(prediction)
# Predictions are in te's row order, so attach the IDs positionally instead of
# relying on index alignment.
pred.insert(loc=0, column='houseID', value=te['houseID'].to_numpy())
pred.to_csv(path_or_buf='answer4final.csv',header=['House ID','Golden Grains'],mode='w',index=False)




