In [2]:
import ast
import pandas as pd
import inspect
from sympy import sympify
 
'''
The goal is a complete static analysis (is_MySubLanguage, infer_type_check and infer_memory) of a program written in my sub-language.

By overriding the visit_* methods of ast.NodeVisitor and calling them recursively on each node and its child nodes, we can easily check whether the program obeys the rules of my sub-language.
The dictionary evm={} stores the necessary information about each variable we create, so that the information can be looked up when the variable is used later.


There are three parts:
1. is_MySubLanguage: check that every construct (e.g. each operator) is in the token set of my language,
                     and check the attribute so that the only function calls are pd.DataFrame and pd.concat.

2. infer_type_check: check every detail to ensure that + and - are only applied to List, Tuple or DataFrame,
                     that a boolop is only applied to bool,
                     that the argument of the concat function is a List of DataFrame,
                     and that the argument of DataFrame is a Tuple, a List or a DataFrame;
                     by recursion, we can check everything.
   Approach: maintain evm={'id': type} to store the type of every variable we create (e.g. x), so that we can later infer the type of an expression such as x+1.
   The result is a list with the inferred type of every statement in the program.

3. infer_memory: by recursion, we can compute the memory cost of every expression and variable, then add them up.
   Approach: for the variables created, maintain evm={'id': number of bytes} to store the size in bytes of every variable we create (e.g. x), so that we can easily infer the memory of an expression such as x+1.'''
    
'''


BNF:

program ::= statement
           |statement program 
statement =Return(expr)
     | Assign(expr* targets, expr value)
     | Expr(expr value)
        
expr = BinOp(expr left, operator op, expr right)
         |BoolOp(boolop op, expr* values)
         | Call(expr Func, expr* args, keyword* keywords)
         | Positive_Int(object n)
         |Str(string s)
         | Attribute(expr value, identifier Attr)
         | Name(identifier id)
         | List(expr* elts)
         | Tuple(expr* elts)
         
operator ::= Add | Sub 
boolop::= And|Or
Attr ::= concat|DataFrame  
arguments = (arg* args)
arg = (identifier arg, expr? annotation)

'''


'''
My sub-language only supports positive integers, bool, List, Tuple and DataFrame.
The only operations are add and sub.
The only functions that can be called are pd.DataFrame and pd.concat.

You can
run is_MySubLanguage().visit(tree)   to test whether the program uses only tokens of my sub-language
run infer_type_check().visit(tree)   to check the types; a statement whose type is not right is inferred as None and a message is printed;
                                     if the statement obeys the rules of my sub-language, its type is inferred
run infer_memory().visit(tree)       to estimate the memory use; the unit is bytes
                                     (count int as 2 bytes and bool as 1 byte)


'''

class is_MySubLanguage(ast.NodeVisitor):
    """Syntactic check that a parsed program stays inside the sub-language.

    Use it as ``is_MySubLanguage().visit(ast.parse(source))``.  Every
    ``visit_*`` method returns ``True`` when the node (and the children it
    inspects) is allowed and ``False`` otherwise.  Any node type without a
    dedicated handler is rejected by ``generic_visit``.

    Only the first target of an assignment is inspected, and keyword
    arguments of a call are not inspected at all.
    """

    def visit_Module(self, node):
        """Return True when every top-level statement is accepted."""
        # Build the full list first so that every statement is visited (and
        # its diagnostics printed) even after an earlier one was rejected.
        verdicts = [self.visit(stmt) for stmt in node.body]
        return all(verdicts)

    def visit_Expr(self, node):
        """An expression statement is as valid as the expression it wraps."""
        return bool(self.visit(node.value))

    def visit_BinOp(self, node):
        """Accept ``left + right`` / ``left - right`` with valid operands."""
        if not isinstance(node.op, (ast.Add, ast.Sub)):
            return False
        return bool(self.visit(node.left) and self.visit(node.right))

    def visit_BoolOp(self, node):
        """Accept ``and`` / ``or`` chains whose operands are all valid."""
        if not isinstance(node.op, (ast.And, ast.Or)):
            return False
        verdicts = [self.visit(value) for value in node.values]
        return all(verdicts)

    def visit_Assign(self, node):
        """Accept ``x = expr`` and tuple unpacking ``(x, y) = (e1, e2)``.

        For tuple unpacking the left side must consist of plain names, both
        sides must have the same length and every right-hand element must
        itself be valid.
        """
        target = node.targets[0]
        if isinstance(target, ast.Tuple) and isinstance(node.value, ast.Tuple):
            only_names = all(isinstance(t, ast.Name) for t in target.elts)
            same_length = len(target.elts) == len(node.value.elts)
            return bool(only_names and same_length and self.visit(node.value))
        if isinstance(target, ast.Name):
            return bool(self.visit(node.value))
        return False

    def visit_Return(self, node):
        """Accept ``return expr``; a bare ``return`` is rejected."""
        if node.value is None:
            return False
        return bool(self.visit(node.value))

    def visit_Call(self, node):
        """Accept ``pd.DataFrame(arg, ...)`` and ``pd.concat(arg, ...)``.

        The first positional argument must be a list/tuple literal with valid
        elements, a dict literal, or a variable name (whose type has to be
        verified later by the type-inference pass).
        """
        if not isinstance(node.func, ast.Attribute):
            return False
        if not self.visit(node.func):
            return False
        if not node.args:
            # No positional argument at all: there is nothing to build from.
            print("the input data is not valid for pd.DataFrame!")
            return False
        first = node.args[0]
        if isinstance(first, ast.Dict):
            return True
        if isinstance(first, (ast.List, ast.Tuple)):
            return bool(self.visit(first))
        if isinstance(first, ast.Name):
            print("need furthur type-infer-check for the variable "+first.id)
            return True
        print("the input data is not valid for pd.DataFrame!")
        return False

    def visit_Attribute(self, node):
        """Accept only the attribute accesses ``pd.DataFrame`` and ``pd.concat``."""
        if node.attr not in ('DataFrame', 'concat'):
            print("you can only use pd.concat or pd.DataFrame ")
            return False
        # The owner must be the plain name ``pd``; anything else (``np.concat``,
        # ``a.b.concat``) is rejected instead of crashing on a missing ``.id``.
        if isinstance(node.value, ast.Name) and node.value.id == 'pd':
            return True
        print("The right way to use is as pd.attr(x)!")
        return False

    def visit_Name(self, node):
        """Any variable name is syntactically allowed."""
        return True

    def visit_Constant(self, node):
        """Accept numeric, bool and string literals (``ast.Constant`` nodes)."""
        # bool is a subclass of int, so it is covered by the int check.
        return isinstance(node.value, (int, float, complex, str))

    def visit_Num(self, node):
        """Numeric literal on interpreters that still emit ``Num`` nodes."""
        return True

    def visit_Str(self, node):
        """String literal on interpreters that still emit ``Str`` nodes."""
        return True

    def visit_NameConstant(self, node):
        """``True`` / ``False`` on interpreters that emit ``NameConstant``."""
        return isinstance(node.value, bool)

    def visit_List(self, node):
        """A list literal is valid when all of its elements are valid."""
        verdicts = [self.visit(elt) for elt in node.elts]
        return all(verdicts)

    def visit_Tuple(self, node):
        """A tuple literal is valid when all of its elements are valid."""
        verdicts = [self.visit(elt) for elt in node.elts]
        return all(verdicts)

    def generic_visit(self, node):
        """Reject every node type that has no dedicated handler."""
        return False

#     def visit_FunctionDef(self, node):      
#         results = [self.visit(s) for s in node.body]
#         ev[node.name]="f"
#         return all(results)

# Sample programs used to exercise is_MySubLanguage.

# a1: list literals and pd.DataFrame calls only.
a1 = ast.parse('''
data = [['sss',20],['B',30],["ssdd",10]]
df2 = pd.DataFrame(data,columns=['Name','Age'])
data2 = [['Alex',10],['Bob',12]]

df = pd.DataFrame(data,columns=['Name','Age'])  #
f=[df2,df]
pd.DataFrame(f)
''')

# a2: unary minus.
a2 = ast.parse('''
-3

''')

# a3: unary "not".
a3 = ast.parse('''
not True  
''')

# a4: a set literal.
a4 = ast.parse('''
x={1,2,3}

''')

# Show the tree of a3 before running the checks.
b = ast.dump(a3)
print(b)

# Unary operators (a2, a3) and set literals (a4) have no handler in the
# visitor, so only the first program is reported as valid.
print(f"test1:{is_MySubLanguage().visit(a1)}\n\n")
print(f"test2:{is_MySubLanguage().visit(a2)}\n\n")
print(f"test3:{is_MySubLanguage().visit(a3)}\n\n")
print(f"test4:{is_MySubLanguage().visit(a4)}\n\n")

Module(body=[Expr(value=UnaryOp(op=Not(), operand=NameConstant(value=True)))])
need furthur type-infer-check for the variable data
need furthur type-infer-check for the variable data
need furthur type-infer-check for the variable f
test1:True


test2:False


test3:False


test4:False




In [4]:
'''
the parameter for the + - can only be tuple, List, positive_integer
'''
class infer_type_check(ast.NodeVisitor):
    """Infer one type per top-level statement of a sub-language program.

    Use it as ``infer_type_check().visit(ast.parse(source))``; the result is a
    list with one entry per statement.  The type tokens are ``int``, ``bool``,
    ``str``, ``ast.List``, ``ast.Tuple`` and ``pd.DataFrame``; ``None`` means
    the statement is ill-typed or not inferable (a message is usually printed).

    Variable types are kept in the module-level dict ``evm`` (name -> type),
    which ``visit_Module`` resets.  The element types of list/tuple values are
    additionally remembered per variable so that ``pd.concat(name)`` can be
    checked.  Only the first target of an assignment is bound, and keyword
    arguments of a call are not inspected.
    """

    _CONCAT_ELEMENT_ERROR = "the parameter for pd.concat must be List of DataFrame"
    _CONCAT_CONTAINER_ERROR = "!the parameter for pd.concat must be List or Tuple of DataFrame"
    _FRAME_ERROR = "the parameter for pd.DataFrame must be  List or DataFrame"

    def __init__(self):
        # name -> list of element types, for variables holding a list/tuple
        # whose elements are known.
        self._elt_types = {}

    def visit_Module(self, node):
        """Reset the variable tables and return the type of every statement."""
        global evm
        evm = {}
        self._elt_types = {}
        return [self.visit(stmt) for stmt in node.body]

    def visit_Expr(self, node):
        """The type of an expression statement is the type of its expression."""
        return self.visit(node.value)

    def visit_Return(self, node):
        """The type of ``return expr`` is the type of ``expr`` (None if bare)."""
        if node.value is None:
            return None
        return self.visit(node.value)

    def _element_types(self, node):
        """Return the element types of a list/tuple expression, or None if unknown."""
        if isinstance(node, (ast.List, ast.Tuple)):
            return [self.visit(elt) for elt in node.elts]
        if isinstance(node, ast.Name):
            return self._elt_types.get(node.id)
        return None

    def _bind(self, name, inferred, elements):
        """Record the type (and known element types) of variable *name*."""
        evm[name] = inferred
        if elements is None:
            self._elt_types.pop(name, None)
        else:
            self._elt_types[name] = elements
        return inferred

    def visit_Assign(self, node):
        """Bind the target name(s) and return the inferred type.

        ``x = expr`` returns the type of ``expr``.  ``(x, y) = (e1, e2)``
        binds each name and returns the type of the last element.  Any other
        target shape returns None.
        """
        target = node.targets[0]
        if isinstance(target, ast.Tuple) and isinstance(node.value, ast.Tuple):
            names, values = target.elts, node.value.elts
            well_formed = len(names) == len(values) and all(
                isinstance(n, ast.Name) for n in names
            )
            if not well_formed:
                print("tuple assignment needs the same number of plain names and values")
                return None
            # Evaluate the whole right-hand side before binding anything, so
            # that ``(x, y) = (y, x)`` sees the old bindings.
            inferred = [(self.visit(v), self._element_types(v)) for v in values]
            last = None
            for name_node, (value_type, elements) in zip(names, inferred):
                last = self._bind(name_node.id, value_type, elements)
            return last

        if isinstance(target, ast.Name):
            value_type = self.visit(node.value)
            return self._bind(target.id, value_type, self._element_types(node.value))
        return None

    def visit_Call(self, node):
        """Type-check ``pd.DataFrame(...)`` / ``pd.concat(...)`` calls."""
        if not isinstance(node.func, ast.Attribute):
            return None
        kind = self.visit(node.func)
        if kind == 1:
            return self._check_dataframe_call(node)
        if kind == 2:
            return self._check_concat_call(node)
        return None

    def _check_dataframe_call(self, node):
        """``pd.DataFrame(arg)`` needs a List or DataFrame as first argument."""
        if not node.args:
            print(self._FRAME_ERROR)
            return None
        arg = node.args[0]
        if self.visit(arg) in (ast.List, pd.DataFrame):
            return pd.DataFrame
        if isinstance(arg, ast.Name):
            print("the parameter"+ arg.id+"for pd.DataFrame must be  List or DataFrame")
        else:
            print(self._FRAME_ERROR)
        return None

    def _check_concat_call(self, node):
        """``pd.concat(arg)`` needs a List/Tuple whose elements are DataFrames."""
        if not node.args:
            print(self._CONCAT_CONTAINER_ERROR)
            return None
        arg = node.args[0]

        if isinstance(arg, (ast.List, ast.Tuple)):
            # Literal argument: stop at the first offending element.
            for elt in arg.elts:
                if self.visit(elt) is not pd.DataFrame:
                    print(self._CONCAT_ELEMENT_ERROR)
                    return None
            return pd.DataFrame

        if self.visit(arg) not in (ast.List, ast.Tuple):
            print(self._CONCAT_CONTAINER_ERROR)
            return None

        # A list/tuple that is not a literal.  Its elements are only known
        # when it is a variable assigned from a literal (or an alias of one);
        # otherwise the elements cannot be checked and the call is accepted.
        elements = self._elt_types.get(arg.id) if isinstance(arg, ast.Name) else None
        if elements is not None and any(t is not pd.DataFrame for t in elements):
            print(self._CONCAT_ELEMENT_ERROR)
            return None
        return pd.DataFrame

    def visit_Attribute(self, node):
        """Return 1 for ``.DataFrame``, 2 for ``.concat`` and 0 otherwise."""
        if node.attr == 'DataFrame':
            return 1
        if node.attr == 'concat':
            return 2
        return 0

    def visit_Name(self, node):
        """Look up the recorded type of a variable (None if unassigned)."""
        if node.id in evm:
            return evm[node.id]
        print("the variable"+node.id+" is unassigned, so can not infer type!")
        return None

    def visit_BoolOp(self, node):
        """``and`` / ``or`` yield bool only when every operand is bool."""
        if {self.visit(value) for value in node.values} == {bool}:
            return bool
        return None

    def visit_BinOp(self, node):
        """``+`` / ``-`` need two operands of the same allowed type."""
        if not isinstance(node.op, (ast.Add, ast.Sub)):
            return None
        allowed = (ast.Tuple, ast.List, int, pd.DataFrame)
        # Each operand is visited exactly once; the right operand is skipped
        # when the left one is already unusable.
        left = self.visit(node.left)
        if left not in allowed:
            return None
        right = self.visit(node.right)
        return right if right == left else None

    @staticmethod
    def _literal_type(value):
        """Map a literal value to its type token (None if unsupported)."""
        if isinstance(value, bool):
            return bool
        if type(value) is int:
            return int
        if isinstance(value, str):
            return str
        return None

    def visit_Constant(self, node):
        """Literal on interpreters that emit ``ast.Constant`` nodes."""
        return self._literal_type(node.value)

    def visit_Num(self, node):
        """Numeric literal on interpreters that still emit ``Num`` nodes."""
        return self._literal_type(node.n)

    def visit_Str(self, node):
        """String literal on interpreters that still emit ``Str`` nodes."""
        return self._literal_type(node.s)

    def visit_NameConstant(self, node):
        """``True`` / ``False`` on interpreters that emit ``NameConstant``."""
        return self._literal_type(node.value)

    def visit_List(self, node):
        """A list literal has the type token ``ast.List``."""
        return ast.List

    def visit_Tuple(self, node):
        """A tuple literal has the type token ``ast.Tuple``."""
        return ast.Tuple

    def generic_visit(self, node):
        """Any node without a handler has no type in the sub-language."""
        print("the type is not defined in MySubLanguage! so the type infer is None")
        return None

# Sample programs used to exercise infer_type_check.

# a11: well-typed DataFrame construction and concatenation.
a11 = ast.parse('''
data = [['sss',20],['B',30],["ssdd",10]]
df2 = pd.DataFrame(data,columns=['Name','Age'])
data2 = [['Alex',10],['Bob',12]]
df = pd.DataFrame(data,columns=['Name','Age'])  #
f=[df,df2]
pd.concat((df,df2))

''')

# a22: pd.concat is given a list whose elements are not DataFrames.
a22 = ast.parse('''
x=1
x=pd.concat([x,[3,4]])   
''')

# a33: int mixed with bool, and a set literal.
a33 = ast.parse('''
a=1
1+2+True
x={1,2,3}
1+True
''')

print(f"test1:{infer_type_check().visit(a11)}\n\n")
# pd.concat only accepts a list or tuple of DataFrame, so the second
# statement of a22 is inferred as None.
print(f"test2:{infer_type_check().visit(a22)}\n\n")
print(f"test3:{infer_type_check().visit(a33)}\n\n")



test1:[<class '_ast.List'>, <class 'pandas.core.frame.DataFrame'>, <class '_ast.List'>, <class 'pandas.core.frame.DataFrame'>, <class '_ast.List'>, <class 'pandas.core.frame.DataFrame'>]


the parameter for pd.concat must be List of DataFrame
test2:[<class 'int'>, None]


the type is not defined in MySubLanguage! so the type infer is None
test3:[<class 'int'>, None, None, None]




In [9]:
# in my memory count: {'int': 2 bytes,'bool':1 bytes}

class infer_memory(ast.NodeVisitor):
    """Estimate the memory (in bytes) used by a sub-language program.

    Use it as ``infer_memory().visit(ast.parse(source))``.  The cost model is
    the one of this file: an int counts 2 bytes, a bool 1 byte, a string one
    byte per character, a list/tuple the sum of its elements, and a
    ``pd.DataFrame`` / ``pd.concat`` call the cost of its first positional
    argument.  The total is the sum over all top-level statements.

    Variable sizes are kept in the module-level dict ``evm`` (name -> bytes),
    which ``visit_Module`` resets.  Constructs without a handler yield None,
    so adding them to the total raises ``TypeError``.
    """

    def visit_Module(self, node):
        """Reset the variable table and add up the cost of every statement."""
        global evm
        evm = {}
        total = 0
        for stmt in node.body:
            total = total + self.visit(stmt)
        return total

    def visit_Expr(self, node):
        """An expression statement costs what its expression costs."""
        return self.visit(node.value)

    def visit_Return(self, node):
        """``return expr`` costs what ``expr`` costs; a bare ``return`` costs 0."""
        if node.value is None:
            return 0
        return self.visit(node.value)

    def visit_BinOp(self, node):
        """``+`` / ``-`` cost the sum of both operands (None for other operators)."""
        if isinstance(node.op, (ast.Add, ast.Sub)):
            return self.visit(node.left) + self.visit(node.right)
        return None

    def visit_BoolOp(self, node):
        """``and`` / ``or`` cost the sum of all operands."""
        if isinstance(node.op, (ast.And, ast.Or)):
            return sum(self.visit(value) for value in node.values)
        return None

    def visit_Call(self, node):
        """An attribute call costs what its first positional argument costs."""
        if not isinstance(node.func, ast.Attribute):
            return None
        if not node.args:
            # Nothing is passed positionally, so nothing is counted.
            return 0
        return self.visit(node.args[0])

    def visit_Name(self, node):
        """Return the recorded size of a variable.

        An unassigned variable yields the symbolic placeholder
        ``<name>_memory`` so that the total stays a valid (symbolic) sum.
        NOTE(review): the class used to define ``visit_Name`` twice, and the
        later definition (print a message, return None) silently replaced this
        symbolic one; confirm the symbolic result is the intended behaviour.
        """
        if node.id in evm:
            return evm[node.id]
        print("the variable"+node.id+" is unassigned, so can not calculate memory!")
        return sympify(node.id + "_memory")

    def visit_Assign(self, node):
        """Record the size of the assigned variable(s) and return the cost.

        ``x = expr`` costs what ``expr`` costs.  ``(x, y) = (e1, e2)`` records
        the size of each element under the matching name and costs the sum of
        all elements.  Any other target shape returns None.
        """
        target = node.targets[0]
        if isinstance(target, ast.Tuple) and isinstance(node.value, ast.Tuple):
            # Evaluate the whole right-hand side before binding anything.
            sizes = [self.visit(value) for value in node.value.elts]
            for name_node, size in zip(target.elts, sizes):
                if isinstance(name_node, ast.Name):
                    evm[name_node.id] = size
            return sum(sizes)

        if isinstance(target, ast.Name):
            size = self.visit(node.value)
            evm[target.id] = size
            return size
        return None

    @staticmethod
    def _literal_size(value):
        """Size in bytes of a literal value (None if unsupported)."""
        if isinstance(value, bool):
            return 1
        if type(value) is int:
            return 2
        if isinstance(value, str):
            return len(value)
        return None

    def visit_Constant(self, node):
        """Literal on interpreters that emit ``ast.Constant`` nodes."""
        return self._literal_size(node.value)

    def visit_Num(self, node):
        """Numeric literal on interpreters that still emit ``Num`` nodes."""
        return self._literal_size(node.n)

    def visit_NameConstant(self, node):
        """``True`` / ``False`` on interpreters that emit ``NameConstant``."""
        return self._literal_size(node.value)

    def visit_Str(self, node):
        """String literal on interpreters that still emit ``Str`` nodes."""
        return self._literal_size(node.s)

    def visit_List(self, node):
        """A list literal costs the sum of its elements."""
        return sum(self.visit(elt) for elt in node.elts)

    def visit_Tuple(self, node):
        """A tuple literal costs the sum of its elements."""
        return sum(self.visit(elt) for elt in node.elts)
#     def visit_FunctionDef(self, node):
#          stack={}            
#         for s in node.body:
#                 if type(s) is ast.Assign:
#                     evm
#         results = [self.visit(s)]
#         ev[node.name]="f"
#         return all(results)  
# b=ast.dump(a)
# print(b)

# Sample programs used to exercise infer_memory.

# a111: DataFrame construction, concatenation and addition.
a111 = ast.parse('''
data = [['sss',20],['B',30],["ssdd",10]]
df2 = pd.DataFrame(data,columns=['Name','Age'])
data2 = [['Alex',10],['Bob',12]]
df = pd.DataFrame(data,columns=['Name','Age'])  #
f=[df,df2]
pd.concat((df,df2))

zz=df+df2
''')

# a222: plain int and bool variables plus one addition.
a222 = ast.parse('''
x=1
y=2
q=True
z=x+y
''')

# Show the tree of a222 before estimating.
print(ast.dump(a222))

# Each estimate is the sum, over all statements, of the bytes they account for.
print(f"test1:{infer_memory().visit(a111)}bytes\n\n")
print(f"test2:{infer_memory().visit(a222)}bytes\n\n")



Module(body=[Assign(targets=[Name(id='x', ctx=Store())], value=Num(n=1)), Assign(targets=[Name(id='y', ctx=Store())], value=Num(n=2)), Assign(targets=[Name(id='q', ctx=Store())], value=NameConstant(value=True)), Assign(targets=[Name(id='z', ctx=Store())], value=BinOp(left=Name(id='x', ctx=Load()), op=Add(), right=Name(id='y', ctx=Load())))])
test1:137bytes


test2:9bytes




In [12]:
# data2 = [['Alex',10],['Bob',12]]
# data = [['sss',20],['B',30],["ssdd",10]]
# df2 = pd.DataFrame(data2)
# df = pd.DataFrame(data)  #

# pd.concat([df+df2])
