# China customs history

Convert our history files to the format from China Customs:
    
    - 2006 - 2016: G:\OMRrefinery\China\Exportimport\China Customs History\China_Customs_history.csv
    - 1995 - 2015: G:\OMRrefinery\China\Exportimport\China Import History.xls
    
Columns to output:

"Date of data","Commodity code","Commodity","Trading partner code","Trading partner","Quantity","Unit","Supplimentary Quantity","Supplimentary Unit","Renminbi Yuan",


# 2006-2016 (old)

There is a bug in the source file, ignore this.

In [40]:
import pandas as pd

df = pd.read_csv('G:\\OMRrefinery\\China\\Exportimport\\China Customs History\\China_Customs_history.csv', parse_dates=['Period'], dayfirst=True)
df = df[['Country Code', 'Country Name', 'Product Code', 'Unit', 'Period', 'Value']]
df['Date of data'] = df.Period.dt.strftime("%Y%m")
df['Commodity'] = 'Petroleum oils&oils obtained from bituminous minerals, crude'
df['Unit'] = 'Kilograms'
df["Supplimentary Quantity"] = "0"
df["Supplimentary Unit"] = '-'
df["Renminbi Yuan"] = None

df.rename(columns={"Product Code": "Commodity code", 
                    "Country Code": "Trading partner code", 
                    "Country Name": "Trading partner",
                    "Value": "Quantity"}, inplace=True)

df = df[["Date of data","Commodity code","Commodity","Trading partner code","Trading partner","Quantity","Unit","Supplimentary Quantity","Supplimentary Unit","Renminbi Yuan"]].sort_values(by=['Date of data', 'Trading partner code'])
df.head()

Unnamed: 0,Date of data,Commodity code,Commodity,Trading partner code,Trading partner,Quantity,Unit,Supplimentary Quantity,Supplimentary Unit,Renminbi Yuan
1791,200601,27090000,Petroleum oils&oils obtained from bituminous m...,105,Brunei,81965000,Kilograms,0,-,
2653,200601,27090000,Petroleum oils&oils obtained from bituminous m...,106,Myanmar,0,Kilograms,0,-,
10337,200601,27090000,Petroleum oils&oils obtained from bituminous m...,109,North Korea,51130000,Kilograms,0,-,
2767,200601,27090000,Petroleum oils&oils obtained from bituminous m...,110,Hong Kong,0,Kilograms,0,-,
10932,200601,27090000,Petroleum oils&oils obtained from bituminous m...,110,Hong Kong,0,Kilograms,0,-,


In [41]:
df.to_csv('cn_customs_crudeoil_imp_2006_2016.csv', index=False)

# 2006 - 2016 (new)

In [75]:
import pandas as pd
df = pd.read_excel('G:\\OMRrefinery\\China\\Exportimport\\China Customs History\\China_Customs_history_2006-2016.xlsx', 'Sheet1')\
       .replace(0,None)\
       .melt(id_vars=['Code_cn', 'code', 'country'], var_name='period', value_name='value')\
       .dropna(subset=['value'])
# "Date of data","Commodity code","Commodity","Trading partner code","Trading partner","Quantity","Unit","Supplimentary Quantity","Supplimentary Unit","Renminbi Yuan",
df = df.rename(columns={'Code_cn': 'Trading partner code',
           'country': 'Trading partner',
           'value': 'Quantity'})\
       .assign(**{'Date of data' : df.period.dt.strftime("%Y%m"),
                  'Commodity code': '27090000',
                  'Commodity': 'Petroleum oils&oils obtained from bituminous minerals, crude',
                  'Unit': 'Kilograms',
                  'Supplimentary Quantity': '0',
                  'Supplimentary Unit': '-',
                  'Renminbi Yuan': None})\
       .drop(columns=['code', 'period'])
df = df[["Date of data","Commodity code","Commodity","Trading partner code","Trading partner","Quantity","Unit","Supplimentary Quantity","Supplimentary Unit","Renminbi Yuan"]]
df.to_csv('cn_customs_crudeoil_imp_2006_2016.csv', index=False)

Unnamed: 0,Date of data,Commodity code,Commodity,Trading partner code,Trading partner,Quantity,Unit,Supplimentary Quantity,Supplimentary Unit,Renminbi Yuan
0,200601,27090000,Petroleum oils&oils obtained from bituminous m...,126,Oman,1365277.0,Kilograms,0,-,
1,200601,27090000,Petroleum oils&oils obtained from bituminous m...,113,Iran,1891841.0,Kilograms,0,-,
2,200601,27090000,Petroleum oils&oils obtained from bituminous m...,131,Saudi Arabia,1604366.0,Kilograms,0,-,
3,200601,27090000,Petroleum oils&oils obtained from bituminous m...,139,Yemen,353846.0,Kilograms,0,-,
4,200601,27090000,Petroleum oils&oils obtained from bituminous m...,114,Iraq,353846.0,Kilograms,0,-,
5,200601,27090000,Petroleum oils&oils obtained from bituminous m...,130,Qatar,135634.0,Kilograms,0,-,
6,200601,27090000,Petroleum oils&oils obtained from bituminous m...,138,UAE,569072.0,Kilograms,0,-,
7,200601,27090000,Petroleum oils&oils obtained from bituminous m...,118,Kuwait,139773.0,Kilograms,0,-,
8,200601,27090000,Petroleum oils&oils obtained from bituminous m...,135,Syria,139773.0,Kilograms,0,-,
9,200601,27090000,Petroleum oils&oils obtained from bituminous m...,112,Indonesia,486129.0,Kilograms,0,-,


# 1995 - 2005

In [149]:
# 1995 - 2005
country_mapping = {'Russia': 'Russian Federation',
                   'Saudi': 'Saudi Arabia',
                   'Congo': 'Republic of Congo',
                   'Libya': 'Libyan Arab Jamahiriya',
                   'UAE': 'United Arab Emirates',
                   'Democratic Republic of the Congo': 'Congo (Gold)',
                   'South Sudan': 'Republic of South Sudan',
                   'US': 'United States',
                   'UK': 'United Kingdom'}


df = pd.read_excel('G:\OMRrefinery\China\Exportimport\China Import History.xls', 'Oil Imports').query('Period.dt.year < 2006')
df = df.assign(**{'Date of data' : df.Period.dt.strftime("%Y%m"),
                'Commodity code': '27090000',
                'Commodity': 'Petroleum oils&oils obtained from bituminous minerals, crude',
                'Unit': 'Kilograms',
                'Supplimentary Quantity': '0',
                'Supplimentary Unit': '-',
                'Renminbi Yuan': None})\
       .rename(columns={"Product Code": "Commodity code",
                        "Attribute": "Trading partner",
                    "Country Code": "Trading partner code",
                    "Value": "Quantity"})
df['Trading partner'] = df['Trading partner'].apply(lambda x : country_mapping[x] if x in country_mapping else x)

In [150]:
df.head()


Unnamed: 0,Period,Trading partner,Quantity,Date of data,Commodity code,Commodity,Unit,Supplimentary Quantity,Supplimentary Unit,Renminbi Yuan
0,1995-01-31,Oman,139218345,199501,27090000,Petroleum oils&oils obtained from bituminous m...,Kilograms,0,-,
1,1995-01-31,Indonesia,149962998,199501,27090000,Petroleum oils&oils obtained from bituminous m...,Kilograms,0,-,
2,1995-02-28,Indonesia,209311244,199502,27090000,Petroleum oils&oils obtained from bituminous m...,Kilograms,0,-,
3,1995-03-31,Angola,250294188,199503,27090000,Petroleum oils&oils obtained from bituminous m...,Kilograms,0,-,
4,1995-03-31,Oman,414925988,199503,27090000,Petroleum oils&oils obtained from bituminous m...,Kilograms,0,-,


In [151]:
mapping = pd.read_excel('cn_customs_mappings.xlsx', 'COUNTRIES')

In [152]:
merged = df.merge(mapping, left_on = 'Trading partner', right_on = 'country_name_en', how='left', indicator='join_side').rename(columns={'country_code': 'Trading partner code'})
if len(merged.query('join_side == "left_only"')[['Trading partner']].drop_duplicates()) > 0:
        raise ValueError("still wrong names")
merged = merged[["Date of data","Commodity code","Commodity","Trading partner code","Trading partner","Quantity","Unit","Supplimentary Quantity","Supplimentary Unit","Renminbi Yuan"]].sort_values(by=['Date of data', 'Trading partner code'])
merged.head()

Unnamed: 0,Date of data,Commodity code,Commodity,Trading partner code,Trading partner,Quantity,Unit,Supplimentary Quantity,Supplimentary Unit,Renminbi Yuan
1,199501,27090000,Petroleum oils&oils obtained from bituminous m...,112,Indonesia,149962998,Kilograms,0,-,
0,199501,27090000,Petroleum oils&oils obtained from bituminous m...,126,Oman,139218345,Kilograms,0,-,
2,199502,27090000,Petroleum oils&oils obtained from bituminous m...,112,Indonesia,209311244,Kilograms,0,-,
5,199503,27090000,Petroleum oils&oils obtained from bituminous m...,112,Indonesia,796453975,Kilograms,0,-,
4,199503,27090000,Petroleum oils&oils obtained from bituminous m...,126,Oman,414925988,Kilograms,0,-,


In [153]:
merged['Date of data'].drop_duplicates()

1       199501
2       199502
5       199503
11      199504
18      199505
29      199506
34      199507
40      199508
47      199509
53      199510
57      199511
64      199512
70      199701
77      199702
82      199703
89      199704
96      199705
103     199706
111     199707
120     199708
130     199709
139     199710
147     199711
158     199712
167     199801
178     199802
190     199803
199     199804
205     199805
214     199806
         ...  
1025    200307
1039    200308
1055    200309
1073    200310
1092    200311
1107    200312
1123    200401
1138    200402
1154    200403
1171    200404
1192    200405
1211    200406
1229    200407
1245    200408
1264    200409
1282    200410
1299    200411
1316    200412
1332    200501
1348    200502
1365    200503
1385    200504
1406    200505
1423    200506
1441    200507
1461    200508
1479    200509
1500    200510
1519    200511
1536    200512
Name: Date of data, Length: 120, dtype: object

In [154]:
merged.to_csv('cn_customs_crudeoil_imp_1995_2005.csv', index=False)

# Load history into External-DB

In [72]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [73]:
import scraper.jobs.cn_customs.crude_imports_job as crude_imports_job
import logging
crude_imports_job.logger.level = logging.DEBUG

cn = crude_imports_job.CrudeImportsJob(full_load=True)

cn.run()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [17]:
cn.transform()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [30]:
cn.data[1:10]

[{'source': 'cn_customs_crudeoil_imp_2006',
  'period': 'JAN2006',
  'to_area': 'MYANMAR',
  'unit': 'KT',
  'value': 0.0,
  'provider': 'CN_CUSTOMS',
  'area': 'CHINA',
  'product': 'CRUDEOIL',
  'frequency': 'Monthly',
  'flow': 'IMPORTS',
  'original': True},
 {'source': 'cn_customs_crudeoil_imp_2006',
  'period': 'JAN2006',
  'to_area': 'KOREADPR',
  'unit': 'KT',
  'value': 51.13,
  'provider': 'CN_CUSTOMS',
  'area': 'CHINA',
  'product': 'CRUDEOIL',
  'frequency': 'Monthly',
  'flow': 'IMPORTS',
  'original': True},
 {'source': 'cn_customs_crudeoil_imp_2006',
  'period': 'JAN2006',
  'to_area': 'HONGKONG',
  'unit': 'KT',
  'value': 0.0,
  'provider': 'CN_CUSTOMS',
  'area': 'CHINA',
  'product': 'CRUDEOIL',
  'frequency': 'Monthly',
  'flow': 'IMPORTS',
  'original': True},
 {'source': 'cn_customs_crudeoil_imp_2006',
  'period': 'JAN2006',
  'to_area': 'INDIA',
  'unit': 'KT',
  'value': 0.0,
  'provider': 'CN_CUSTOMS',
  'area': 'CHINA',
  'product': 'CRUDEOIL',
  'frequency':

In [37]:
import pandas as pd
df = pd.DataFrame(cn.data)


In [38]:
#df.dropna(axis='index', subset=['value'],inplace=True)
df.isnull().values.any()

False

In [43]:
df

Unnamed: 0,area,flow,frequency,original,period,product,provider,source,to_area,unit,value
0,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,BRUNEI,KT,81.965000
1,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,MYANMAR,KT,0.000000
2,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,KOREADPR,KT,51.130000
3,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,HONGKONG,KT,0.000000
4,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,INDIA,KT,0.000000
5,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,INDONESIA,KT,486.129000
6,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,INDONESIA,KT,34.531000
7,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,INDONESIA,KT,0.000000
8,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,IRAN,KT,1891.841000
9,CHINA,IMPORTS,Monthly,True,JAN2006,CRUDEOIL,CN_CUSTOMS,cn_customs_crudeoil_imp_2006,IRAN,KT,0.000000


In [23]:
cn.sources

['BaseSource(c...imp_2006.csv)', 'BaseSource(c...imp_1995.csv)']

In [13]:
df.index

Index([], dtype='object')

In [3]:
import pandas as pd
data = pd.DataFrame([{'country': 'AUSTRALI'}])
data.isin(['AUSTRALIA', 'BRAZIL'])

Unnamed: 0,country
0,False


In [7]:
len(data.index)

1