# JAMSTEC DBスキーム確認ツールfor Apex(APF9)

Excelファイルを読んでフロートの生データと比較して既に登録用のテーブル・フィールドがあるかどうかを確認する。

フロート生データ（テキスト）はメーカー提供ツールにより変換する

In [1]:
import os
import pandas as pd
import re
import termcolor
import Levenshtein # レーベンシュタイン距離ライブラリにある、ジャロ・ウインクラー距離を計算するのに使う
# jaro_dist = Levenshtein.jaro_winkler(srt1 , str2)

apf9_excel = pd.read_excel('Apex_apf9.xlsx' , sheet_name=None) # sheet_name=Noneで全てのシート読み込む

## ジャロ・ウィンクラー距離法の関数

２つの文字列の類似度を返す。１で完全一致

#### Winkler, W. E. (1990). "String Comparator Metrics and Enhanced Decision Rules in the Fellegi-Sunter Model of Record Linkage". Proceedings of the Section on Survey Research Methods. American Statistical Association: 354–359.

In [2]:
def jaro_dist(str1,str2):
    return Levenshtein.jaro_winkler(str1,str2)

In [3]:
print(apf9_excel['tech'].columns)

Index(['Unnamed: 0',
       '■：DBに新規定義する項目　■：DBに定義しない項目　■：データがあり、DB列がない項目　■：データがなく、DB列がある項目　■：次回のプロファイルで挿入',
       'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6',
       'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11',
       'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14'],
      dtype='object')


### エクセルファイルの列は目視で確認して抜き出す
エクセルファイルが更新されたら変更必要

In [4]:
msgcol = apf9_excel['scheme'].iloc[:,1]
a19msg = apf9_excel['scheme'].iloc[:,7] # ７列目がA19のmsg , 8列目はlog 以降同様に数える
a21msg = apf9_excel['scheme'].iloc[:,9]
a25msg = apf9_excel['scheme'].iloc[:,11]
a26msg = apf9_excel['scheme'].iloc[:,13]

scheme = pd.concat([msgcol,a19msg,a21msg,a25msg,a26msg], axis=1)\
.rename(columns={'■：DBに新規定義する項目　■：DBに定義しない項目　■：データがあり、DB列がない項目　■：データがなく、DB列がある項目':'field_name'\
                 , 'Unnamed: 7':'a19' , 'Unnamed: 9':'a21' , 'Unnamed: 11':'a25' , 'Unnamed: 13':'a26'})

techcol = apf9_excel['tech'].iloc[:,1]
a19msgtech = apf9_excel['tech'].iloc[:,7]
a19logtech = apf9_excel['tech'].iloc[:,8]
a21msgtech = apf9_excel['tech'].iloc[:,9]
a21logtech = apf9_excel['tech'].iloc[:,10]
a25msgtech = apf9_excel['tech'].iloc[:,11]
a25logtech = apf9_excel['tech'].iloc[:,12]
a26msgtech = apf9_excel['tech'].iloc[:,13]
a26logtech = apf9_excel['tech'].iloc[:,14]

tech = pd.concat([techcol,a19msgtech,a19logtech,a21msgtech,a21logtech,a25msgtech,a25logtech,a26msgtech,a26logtech], axis=1)\
.rename(columns={'■：DBに新規定義する項目　■：DBに定義しない項目　■：データがあり、DB列がない項目　■：データがなく、DB列がある項目　■：次回のプロファイルで挿入':'field_name'\
                 , 'Unnamed: 7':'a19msg' , 'Unnamed: 8':'a19log' , 'Unnamed: 9':'a21msg' , 'Unnamed: 10':'a21log' , 'Unnamed: 11':'a25msg' , 'Unnamed: 12':'a25log' , 'Unnamed: 13':'a26msg' , 'Unnamed: 14':'a26log' })


### テキスト生データ読み込み

バイナリからの変換はメーカー提供ツールによる

In [5]:
with open('rawdata/A26/2695.001.msg','r') as msg:
    msgline = msg.readlines()
    
with open('rawdata/A26/2695.001.log','r') as log:
    logline = log.readlines()

###  msgファイル変換

In [13]:
# msgline,loglineとも1行毎に読み込んでる。
# 安直にループさせて新しいDataFrameを作ることは出来ないので辞書リストに一度保存する（Pythonの仕様）
msgdata = {}
i = 0
for orig_msg in msgline:
    regex = r'^\$' # 先頭が＄ならその行をパラメータと判断する
    regex2 = r'^[A-Z]' # 先頭が大文字アルファベットの場合もパラメータと判断
    pattern = re.compile(regex)
    pattern2 = re.compile(regex2)

    # 行頭$スペースの2文字を除いて（の手前までをパラメータとして保存
    if (pattern.match(orig_msg) ): # ’(’以降は削除
        pos = orig_msg.find('(')
        #print(orig_msg[2:pos])
        msgdata[i] =  (orig_msg[2:pos])
        i += 1
    elif (pattern2.match(orig_msg)): # 先頭大文字、＝までもパラメータとする
        pos = orig_msg.find('=')
        #print(orig_msg[:pos])
        msgdata[i] =  orig_msg[:pos]
        i += 1

# dict_valuesをPythonの listに変換する必要はなかった。
#print(data.values())
msgdf = pd.DataFrame(msgdata.values() , columns={'field_name'})
#print(msgdf)

                          field_name
0    Mission configuration for Apf9i
1                      AscentTimeOut
2                          AtDialCmd
3                         AltDialCmd
4                      BuoyancyNudge
..                               ...
110                     Sbe41cpVolts
111                    Sbe41cpStatus
112            SurfacePistonPosition
113                  SurfacePressure
114                           Vacuum

[115 rows x 1 columns]


### logファイル変換

In [7]:
logdata = {}
i  = 0

for orig_log in logline:
# sec)が見えたらその次の文字列をパラメータとみなす
    pos = orig_log.find('sec) ')
    pos2 = orig_log.find('()')
    #print(orig_log[pos+5:pos2].split(' ')[0])
    logdata[i] = orig_log[pos+5:pos2].split(' ')[0]
    i += 1

logdf = pd.DataFrame(logdata.values(),columns={'field_name'})
#print(logdf)
#print(type(logdata))

### 比較する

#### msgファイルとスキーマ情報タブの比較

In [18]:
# 結果保存用のDFを準備
msgdf = pd.DataFrame(columns=['index','msg','xls','score'])

for line in msgdata.values():
    #print(line)
    query = scheme['a26'].str.startswith(line , na=False)
    #print(query.values)
    if (query[query == True].first_valid_index()):
        print( termcolor.colored(line + ' field exists.','blue'))
    else:
        print( termcolor.colored(line + ' is not found.' , 'red'))
        
        for index,item in scheme.iterrows():
            score = jaro_dist(str(item['a26']) , line) # 引数を入れ替えると結果が多少変わる。
            #print(line + ' is probably ' + str(item['i4']) + ' ( ' + str(round(score,2)*100) + '%)' )
            record = pd.Series([index , line , item['a26'] , score] , index=msgdf.columns)
            msgdf = msgdf.append(record , ignore_index=True)


[31mMission configuration for Apf9i is not found.[0m
[34mAscentTimeOut field exists.[0m
[34mAtDialCmd field exists.[0m
[34mAltDialCmd field exists.[0m
[34mBuoyancyNudge field exists.[0m
[34mBuoyancyNudgeInitial field exists.[0m
[34mConnectTimeOut field exists.[0m
[34mCpActivationP field exists.[0m
[34mDeepProfileDescentTime field exists.[0m
[34mDeepProfilePistonPos field exists.[0m
[31mDeepProfilePressure is not found.[0m
[34mDownTime field exists.[0m
[31mFloatId is not found.[0m
[34mFullExtension field exists.[0m
[34mFullRetraction field exists.[0m
[34mIceDetectionP field exists.[0m
[34mIceEvasionP field exists.[0m
[34mIceMLTCritical field exists.[0m
[34mIceMonths field exists.[0m
[31mMaxAirBladder is not found.[0m
[34mMaxLogKb field exists.[0m
[34mMissionPrelude field exists.[0m
[34mOkVacuum field exists.[0m
[34mPActivationPistonPosition field exists.[0m
[34mParkDescentTime field exists.[0m
[34mParkPistonPos field exists.[0m
[31mPa

     index                              msg             xls     score
0        0  Mission configuration for Apf9i             NaN  0.000000
1        1  Mission configuration for Apf9i             A26  0.000000
2        2  Mission configuration for Apf9i             msg  0.455197
3        3  Mission configuration for Apf9i             NaN  0.000000
4        4  Mission configuration for Apf9i             NaN  0.000000
...    ...                              ...             ...       ...
3388    34                           Vacuum  IceMLTCritical  0.412698
3389    35                           Vacuum       IceMonths  0.425926
3390    36                           Vacuum             NaN  0.500000
3391    37                           Vacuum      FlashErase  0.422222
3392    38                           Vacuum     FlashCreate  0.419192

[3393 rows x 4 columns]


## ソートして上位3件を表示

In [9]:
items = len(scheme.dropna()) # Index数、この数分ジャロ・ウインクラー距離を計算したら次のマッチしなかった語句になる
kazu = int( len(msgdf) / items ) # クエリーのリストになくnot foundで表示した数

for count in range(kazu):
    res = msgdf[items*count : items*(count+1)]
    msgrank = res.sort_values('score',ascending=False)[:3] # Score降順にソートして上から3つを表示

    if((re.match('^.',msgrank.iat[0,1])) is not None  ):
        # print(ranking.iat[0,3])
        # score が０と１の時は抜く（アルゴリズム？で１００％が結構出てる。
        disp_rank = str(msgrank.iat[0,1]) + ' is probably ' + str(msgrank.iat[0,2]) + ' ( ' + str(round(msgrank.iat[0,3] , 2) * 100 ) + '% )' + '\n' \
        + ' or ' + str(msgrank.iat[1,2]) + ' ( ' + str(round(msgrank.iat[1,3] , 2) * 100 ) + '% )' + '\n' \
        + ' or ' + str(msgrank.iat[2,2]) + ' ( ' + str(round(msgrank.iat[2,3] , 2) * 100 ) + '% )' + '\n' 
    
        print( disp_rank )

Mission configuration for Apf9i is probably MissionPrelude ( 88.0% )
 or PActivationPistonPosition ( 56.00000000000001% )
 or ParkPistonPos ( 52.0% )

DeepProfilePressure is probably DeepProfilePistonPos ( 100.0% )
 or DownTime ( 56.00000000000001% )
 or ParkPistonPos ( 54.0% )

DeepProfilePressure is probably DeepProfileDescentTime ( 100.0% )
 or Verbosity ( 55.00000000000001% )
 or IceDetectionP ( 54.0% )

FloatId is probably FullExtension ( 60.0% )
 or FullRetraction ( 59.0% )
 or BuoyancyNudge ( 55.00000000000001% )

FloatId is probably FlashErase ( 66.0% )
 or FlashCreate ( 65.0% )
 or AtDialCmd ( 59.0% )

MaxAirBladder is probably AirBladderMaxP ( 71.0% )
 or MaxLogKb ( 67.0% )
 or MissionPrelude ( 59.0% )

ParkPressure is probably ParkPistonPos ( 85.0% )
 or ParkDescentTime ( 81.0% )
 or MissionPrelude ( 57.99999999999999% )

ParkPressure is probably PnPCycleLen ( 61.0% )
 or FlashCreate ( 52.0% )
 or PActivationPistonPosition ( 51.0% )

RafosWindowN is probably ParkPistonPos ( 

#### msgファイルと技術情報タブの比較

In [10]:
# 結果保存用のDFを準備
msgtechdf = pd.DataFrame(columns=['index','msg','xls','score'])

for line in logdata.values():
    #print(line)
    query = tech['a26msg'].str.startswith(line , na=False)
    #print(query.values)
    if (query[query == True].first_valid_index()):
        print( termcolor.colored( line + ' field exists.' , 'blue'))
    else:
        print( termcolor.colored( line + ' is not found.' , 'red'))
        
        for index,item in tech.iterrows():
            score = jaro_dist(str(item['a26msg']) , line) # 引数を入れ替えると結果が多少変わる。
            #print(line + ' is probably ' + str(item['i4']) + ' ( ' + str(round(score,2)*100) + '%)' )
            rec_score = pd.Series([index , line , item['a26msg'] , score] , index=msgtechdf.columns)
            msgtechdf = msgtechdf.append(rec_score , ignore_index=True)
        

[31mTelemetryInit is not found.[0m
[31mAirSystem is not found.[0m
[31mGpsServices is not found.[0m
[31mGpsServices is not found.[0m
[31mgga is not found.[0m
[31mgga is not found.[0m
[31mGpsServices is not found.[0m
[31mGpsServices is not found.[0m
[31mGpsServices is not found.[0m
[31mgga is not found.[0m
[31mgga is not found.[0m
[31mGpsServices is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmea

## ソートして上位3件表示

In [143]:
# スコアの結果DF（上ではdiff_resultをソートして上から3つを表示
techlength = len(tech) # Index数、この数分ジャロ・ウインクラー距離を計算したら次のマッチしなかった語句になる
num = int( len(msgtechdf) / techlength ) # クエリーのリストになくnot foundで表示した数

for count in range(num):
    sort_df = msgtechdf[techlength*count : techlength*(count+1)]
    sort_rank = sort_df.sort_values('score',ascending=False)[:3] # Score降順にソートして上から3つを表示
    
    #print(diff_result)
    if((re.match('^.',sort_rank.iat[0,1])) is not None  ):
        #print(diff_ranking.iat[0,3])
        # score が０と１の時は抜く（アルゴリズム？で１００％が結構出てる。
        disp_rank2 = str(sort_rank.iat[0,1]) + ' is probably ' + str(sort_rank.iat[0,2]) + ' ( ' + str(round(sort_rank.iat[0,3] , 2) * 100 ) + '% )' + '\n' \
        + ' or ' + str(sort_rank.iat[1,2]) + ' ( ' + str(round(sort_rank.iat[1,3] , 2) * 100 ) + '% )' + '\n' \
        + ' or ' + str(sort_rank.iat[2,2]) + ' ( ' + str(round(sort_rank.iat[2,3] , 2) * 100 ) + '% )' + '\n' 
    
        print( disp_rank2 )


TelemetryInit is probably QuiescentAmps ( 52.0% )
 or QuiescentVolts ( 51.0% )
 or IceMLSample ( 50.0% )

AirSystem is probably AirPumpAmps ( 72.0% )
 or AirPumpVolts ( 71.0% )
 or AirBladderPressure ( 69.0% )

GpsServices is probably GpsFixTime ( 71.0% )
 or QuiescentAmps ( 61.0% )
 or QuiescentVolts ( 60.0% )

GpsServices is probably GpsFixTime ( 71.0% )
 or QuiescentAmps ( 61.0% )
 or QuiescentVolts ( 60.0% )

gga is probably nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )

gga is probably nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )

GpsServices is probably GpsFixTime ( 71.0% )
 or QuiescentAmps ( 61.0% )
 or QuiescentVolts ( 60.0% )

GpsServices is probably GpsFixTime ( 71.0% )
 or QuiescentAmps ( 61.0% )
 or QuiescentVolts ( 60.0% )

GpsServices is probably GpsFixTime ( 71.0% )
 or QuiescentAmps ( 61.0% )
 or QuiescentVolts ( 60.0% )

gga is probably nan ( 56.00000000000001% )
 or nan ( 56.000000

#### logファイルと技術情報の比較

In [146]:
# 結果保存用のDFを準備
logtechdf = pd.DataFrame(columns=['index','msg','xls','score'])

for line in logdata.values():
    #print(line)
    query = tech['a26log'].str.startswith(line , na=False)
    #print(query.values)
    if (query[query == True].first_valid_index()):
        print( termcolor.colored( line + ' field exists.' , 'blue'))
    else:
        print( termcolor.colored( line + ' is not found.' , 'red'))
        
        for index,item in tech.iterrows():
            score = jaro_dist(str(item['a26msg']) , line) # 引数を入れ替えると結果が多少変わる。
            #print(line + ' is probably ' + str(item['i4']) + ' ( ' + str(round(score,2)*100) + '%)' )
            rec_score = pd.Series([index , line , item['a26msg'] , score] , index=logtechdf.columns)
            logtechdf = logtechdf.append(rec_score , ignore_index=True)
        

[34mTelemetryInit field exists.[0m
[31mAirSystem is not found.[0m
[34mGpsServices field exists.[0m
[34mGpsServices field exists.[0m
[31mgga is not found.[0m
[31mgga is not found.[0m
[34mGpsServices field exists.[0m
[34mGpsServices field exists.[0m
[34mGpsServices field exists.[0m
[31mgga is not found.[0m
[31mgga is not found.[0m
[34mGpsServices field exists.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmeaSentences is not found.[0m
[31mLogNmea

## ソートして上位３つを表示

In [147]:
# スコアの結果DF（上ではdiff_resultをソートして上から3つを表示
techlength = len(tech) # Index数、この数分ジャロ・ウインクラー距離を計算したら次のマッチしなかった語句になる
num = int( len(logtechdf) / techlength ) # クエリーのリストになくnot foundで表示した数

for count in range(num):
    sort_tdf = logtechdf[techlength*count : techlength*(count+1)]
    sort_trank = sort_tdf.sort_values('score',ascending=False)[:3] # Score降順にソートして上から3つを表示
    
    #print(diff_result)
    if((re.match('^.',sort_trank.iat[0,1])) is not None  ):
        #print(diff_ranking.iat[0,3])
        # score が０と１の時は抜く（アルゴリズム？で１００％が結構出てる。
        disp_rank3 = str(sort_trank.iat[0,1]) + ' is probably ' + str(sort_trank.iat[0,2]) + ' ( ' + str(round(sort_trank.iat[0,3] , 2) * 100 ) + '% )' + '\n' \
        + ' or ' + str(sort_trank.iat[1,2]) + ' ( ' + str(round(sort_trank.iat[1,3] , 2) * 100 ) + '% )' + '\n' \
        + ' or ' + str(sort_trank.iat[2,2]) + ' ( ' + str(round(sort_trank.iat[2,3] , 2) * 100 ) + '% )' + '\n' 
    
        print( disp_rank3 )

AirSystem is probably AirPumpAmps ( 72.0% )
 or AirPumpVolts ( 71.0% )
 or AirBladderPressure ( 69.0% )

gga is probably nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )

gga is probably nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )

gga is probably nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )

gga is probably nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )
 or nan ( 56.00000000000001% )

LogNmeaSentences is probably QuiescentVolts ( 56.00000000000001% )
 or IceMLSample ( 53.0% )
 or BuoyancyPumpOnTime ( 49.0% )

LogNmeaSentences is probably QuiescentVolts ( 56.00000000000001% )
 or IceMLSample ( 53.0% )
 or BuoyancyPumpOnTime ( 49.0% )

LogNmeaSentences is probably QuiescentVolts ( 56.00000000000001% )
 or IceMLSample ( 53.0% )
 or BuoyancyPumpOnTime ( 49.0% )

LogNmeaSentences is probably QuiescentVolts ( 56.00000000000001% )
 or IceMLSample ( 53.0%