In [None]:
#Practical 1

import numpy as np
from scipy.optimize import leastsq
# Sample observations: x holds the predictor values, y the matching responses
x = np.array((22, 44, 21, 56, 77))
y = np.array((56, 88, 90, 33, 32))
# Model function: the straight line y = m*x + c
def linear_model(params, x):
    """Evaluate the straight-line model at ``x``.

    ``params`` is a two-element sequence unpacked as (slope, intercept);
    the return value is ``slope * x + intercept``.
    """
    slope, intercept = params
    scaled = slope * x
    return scaled + intercept
# Residual function handed to the optimizer: observed minus predicted values
def residuals(params, y, x):
    """Return the difference between ``y`` and the model prediction at ``x``.

    The prediction comes from ``linear_model(params, x)``; the argument order
    (params, y, x) matches the ``args=(y, x)`` tuple passed to ``leastsq``.
    """
    predicted = linear_model(params, x)
    return y - predicted
# Starting point for the optimizer, in the order [slope, intercept]
initial_params = [1, 1]
# Run the least-squares optimization; args forwards (y, x) to residuals
result = leastsq(func=residuals, x0=initial_params, args=(y, x))
# The first entry of the returned tuple holds the fitted parameters
m_opt, c_opt = result[0]
# Report the fitted slope and intercept
print(f"Optimized slope(m): {m_opt}")
print(f"Optimized intercept(c): {c_opt}")





#Practical 2
import numpy as np
from scipy.optimize import minimize
# Simulated data for demonstration
np.random.seed(42)  # fixed seed so the simulated sample is reproducible
n = 100  # number of observations
# Draw n standard-normal regressor values. np.random.normal(n) would instead
# return a single scalar drawn around a mean of n, which leaves the regressor
# constant and makes the intercept and slope impossible to separate.
x = np.random.normal(0, 1, n)
epsilon = np.random.normal(0, 1, n)  # noise term, one draw per observation
beta_true = [2.5, 1.7]  # true [intercept, slope] used to generate y
y = beta_true[0] + beta_true[1] * x + epsilon
# Moment condition function: the regression error for each observation
def moment_conditions(params, x, y):
    """Return ``y - beta0 - beta1 * x`` for the given parameters.

    ``params`` is a two-element sequence unpacked as (beta0, beta1).
    """
    intercept, slope = params
    slope_term = slope * x
    return (y - intercept) - slope_term
# GMM objective function: quadratic form of the moment vector
def gmm_objective(params, x, y):
    """Return the inner product of the moment vector with itself.

    The moments come from ``moment_conditions(params, x, y)``; no weighting
    matrix appears in the product.
    """
    g = moment_conditions(params, x, y)
    quadratic_form = g.T @ g
    return quadratic_form
# Starting values for the parameters, in the order [beta0, beta1]
initial_guess = [0.5, 0.5]
# Minimize the GMM objective; x and y are forwarded to it through args
result = minimize(fun=gmm_objective, x0=initial_guess, args=(x, y))
# result.x holds the parameter vector found by the optimizer
estimated_beta0, estimated_beta1 = result.x
print("Estimated beta0:", estimated_beta0)
print("Estimated beta1:", estimated_beta1)



#Practical 3
import numpy as np
import pandas as pd
from statsmodels.tsa.api import VAR
from sklearn.linear_model import RidgeCV
# Build a small synthetic data set: nobs rows of two standard-normal variables
np.random.seed(0)
nobs = 10
data = np.random.randn(nobs, 2)
# Wrap the raw array in a labelled DataFrame
columns = ['variable1', 'variable2']
df = pd.DataFrame(data, columns=columns)
# Lag features are taken from the first-differenced series; data_lagged is
# computed before any lag column is added, so it only has the two variables
lags = 2
data_lagged = df.diff().dropna()
for lag in range(1, lags + 1):
    shifted = data_lagged.shift(lag)  # shift both differenced columns at once
    for col in columns:
        # Assignment aligns on the index, so rows without a lag value get NaN
        df[f'{col}_lag{lag}'] = shifted[col]
# Keep only the rows where every lag feature is available
df = df.dropna()
# Ordered 80/20 split: the first rows train the model, the remainder is held out
train_size = int(0.8 * len(df))
train_data, test_data = df.iloc[:train_size], df.iloc[train_size:]
# Fit the ridge model: the lag columns are the features, the two original
# variables are the targets; RidgeCV chooses among the listed alphas
X_train = train_data.drop(columns=columns)
y_train = train_data[columns]
ridge_model = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0]).fit(X_train, y_train)  # adjust alphas as needed
coefficients = ridge_model.coef_
# Forecast the held-out rows from their lag features
X_test = test_data.drop(columns=columns)
forecast = ridge_model.predict(X_test)
# Label the forecasts with the target names and the held-out index
forecast_df = pd.DataFrame(forecast, index=test_data.index, columns=columns)
# Plot original data and forecasted values
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
# df also carries the lag-feature columns by this point, so plot only the
# target variables; plotting the whole frame would draw the lag features as
# original data too. One labelled line per variable keeps the legend readable.
for col in columns:
    plt.plot(df[col], label=f'{col} (original)')
    plt.plot(forecast_df[col], label=f'{col} (forecast)', linestyle='dashed')
plt.legend()
plt.title('Regularized VAR Forecast')
plt.xlabel('Time')
plt.ylabel('Values')
plt.show()



#Practical 4

import numpy as np
import pandas as pd
from statsmodels.tsa.vector_ar.vecm import VECM
from statsmodels.tsa.vector_ar.vecm import select_coint_rank
#Generate some example data
np.random.seed(0)
nobs = 100
# Cointegration needs non-stationary series whose linear combination is
# stationary. Plain white noise is already stationary, so series1 is built as
# a random walk (cumulative sum of standard-normal shocks) and series2 is tied
# to it as 2*series1 plus stationary noise.
data = np.empty((nobs, 2))
data[:, 0] = np.cumsum(np.random.randn(nobs))
data[:, 1] = 2*data[:, 0] + np.random.randn(nobs) #Create a cointegrated relationship
#Create a DataFrame from the generated data
df = pd.DataFrame(data, columns=['series1', 'series2'])
# Choose the cointegration rank; det_order=-1 and two lagged differences are
# the settings passed to the rank-selection routine
coint_rank_result = select_coint_rank(
    df,
    det_order=-1,
    k_ar_diff=2,
)
# Pull the selected rank out of the result object
coint_rank = coint_rank_result.rank
# Build the VECM with the same number of lagged differences and the selected
# rank, then fit it
model = VECM(
    df,
    k_ar_diff=2,
    coint_rank=coint_rank,
)
results = model.fit()
# Show the fitted model summary
print(results.summary())


#Case study DDCM
Topic: Factors Influencing Adoption of IoT for Data-driven Decision Making in Asset Management Organizations.
Degree: Delft University of Technology, Netherlands
Author: Paul Brous, Marijn Janssen, Daan Schraven, Jasper Spiegeler and Baris Can Duzgun
Abstract :
The Internet of Things (IoT) enables the creation of data that can be used to gain further insights into the current and 
predicted state of the infrastructure and may help automate the asset management process. The objective of this paper is to 
explore implementation factors for adoption of new data sources for decision-making in asset management organizations. Based 
on a systematic literature review and case studies in the asset management domain, this paper derives the current use and 
expectations of new data sources for decision-making in asset management. The paper concludes that although recent technological
developments have enabled the deployment of IoT for asset management, the current level of adoption remains low. The inherent 
complexity of adopting a data driven approach to asset management requires an effective data governance strategy to ensure data
quality, manage expectations, build trust and integrate IoT data in decision-making processes.
Introduction :
Many organizations tasked with managing civil infrastructure routinely store large volumes of data. More and more, new sources 
provide this data for producing and collecting real world data that can be communicated on the internet, such as sensor devices, social media, and user-generated data. The Internet of Things (IoT) describes a situation whereby physical objects are connected to the Internet and are able to communicate with, and identify themselves to, other devices. For example, this may include GPS-based navigation applications for smartphones based on real-time traffic information shared by other drivers, or real-time weather service based on the information updated by sensors of users’ smartphones or weather radars and other weather observation tools. This research takes place in the asset management domain of large scale civil infrastructure. Asset management (AM) is a discipline for optimizing and applying strategies related to work planning decisions in order to effectively and efficiently meet the desired objective.
IoT is important to AM because an object that can communicate digitally also becomes connected to surrounding objects and 
data infrastructures. For example, it is possible to determine the position and length of traffic jams, and to monitor trends, 
variations, and relationships in the road network over time using smartphone data, networked sensors and cameras to analyse 
traffic flow. But in order for IoT data to be accepted by asset managers, a variety of barriers such as trust and acceptance 
still need to be overcome. The concept of trust is often used in various contexts and with different meanings. Trust is a 
complex notion which is hard to define, although its importance in data-driven decision-making is widely recognized.
Case Studies :
Two cases have been studied to identify how the adoption of IoT data is done by asset management organizations. In the first 
case we study the adoption of IoT data by a consortium for the maintenance of a bridge. In the second case we study the adoption
of IoT data for the maintenance of the road sections of a highway between two cities in the east of the Netherlands.
Case 1: Bridge Inspection with a Drone
IoT is expected to enable remote sensing of the condition of bridges and enhance the available information on their condition 
if performed correctly. For this bridge, new methods of remote sensing have been tested, in the expectation to pilot with IoT 
sensors in the succeeding year to improve the quality of monitoring. For this case, the main driver for IoT and other forms of 
remote sensing appeared to be the lack of accessibility of some parts of the bridge for visual inspections. For example, 
at locations above and below the bridge there is no space for setting up equipment (e.g. scaffolding, boom lifts or ladders) 
such that a visual inspector can work. As a result, parts of the bridge remain poorly inspected, making it harder to physically 
detect local cases of bridge deterioration. In combination with the innovation program of RWS, the maintenance consortium used 
the pilot project to perform inspections with help of a drone that was equipped with a camera to observe the less reachable 
parts of the bridge, thereby increasing the operation’s efficiency. The drone inspection was also performed at better reachable 
parts to compare the inspection results of the drone against the inspection results of a human inspector. This comparison gave 
new data for the usefulness of adopting IoT data, since the use of drones during inspection was relatively new for bridge 
assessments. In terms of a strategic use of IoT data, the consortium judged that the obtained information was good enough to 
give a reliable overview of the found damages at the bridge parts, which were harder to reach. This shows that the decision 
support services and performance report could be based on a more complete view of bridge data. 
In terms of a tactical use of IoT data the consortium found that the bridge inspection with the drone resulted in less costs 
than a human inspection with the needed equipment to access the areas of the bridge. Therefore, the adoption of drones results 
in a reduction of costs with respect to inspecting a bridge. On the operational use of IoT data, the adoption of a drone showed 
practical constraints. The drone did not receive a GPS signal under the bridge deck, which prevented it from following its 
predetermined flight route. Therefore, it had to be steered manually, which made the process of documenting and keeping record of
the locations of the taken inspection photographs more difficult and time consuming than expected. Secondly, the damages 
themselves were clearly visible but the extent and size were hard to measure from only the digital images. Thirdly, the 
drone had to fly at a minimum distance of 1.5 meters from the bridge components which resulted in the incapability of observing 
the bearings of the bridge and affected the completeness of the drone’s dataset. Finally, the bridge had to be closed off for 
traffic due to safety regulations. The consortium concluded that the use of drones is not ideal for assessing bridges. Another 
interesting side-note with regards to this case of adopting drones for bridge inspections, is that the consortium has made 
further plans to implement a pilot project using IoT sensors that communicate over a Long-Range Low-Power (LoRa) network to
monitor bridge movements. Robust, smart wireless sensing systems that are suitable for use in civil engineering have been 
developed specially for this project, as well as the software to analyse and interpret the data. According to the interviewees, 
these new sensing methods should speed up and improve current monitoring methods.

| | IoT data expected to change performance measurement of infrastructure service | IoT data expected to change perception of infrastructure service | IoT data expected to change improvement processes of infrastructure service |
|---|---|---|---|
| Strategic use of IoT data | • Decision support services (trend analysis); • Reporting | • Communication of long term planning and strategic choices; • Improve perceived optimization of services | • Encourage proactive processes; • Encourage self-organization; • Determine strategic changes to infrastructure |
| Tactical use of IoT data | • Cost management; • Time management; • Planning; • Post-event evaluations | • Communication of short term planning and actions; • Improve perceived quality of services; • Public enactment | • Enable directed procedures; • Enable efficient recovery; • Control event occurrence; • Improve utilization of existing infrastructure |





#Case Study :Privacy and Legal
Topic: The London Whale Case Study
Faculty: Indian Institute of Management
Author: Sumit Kumar,Pankaj Kumar Baag
Abstract:
The "London Whale" case study refers to a notable financial scandal involving JPMorgan Chase, one of the largest and most 
prominent financial institutions in the world. The case centers around large trading losses incurred by JPMorgan's 
Chief Investment Office (CIO) in 2012. The losses were initially attributed to a trader named Bruno Iksil, who worked 
at the bank's London office and became known as the "London Whale" due to the size of his trading positions.
Introduction:
The London Whale did cost JP Morgan Chase a whopping $6.2 billion loss in 2012 from trading in the synthetic credit portfolio. 
The case study raised important questions on the banks' addiction to risk and greed. In a way, what Chief Investment 
Officer (CIO) at JP Morgan did was the same old narrative. They doubled down after a loss from trade with even bigger 
gambles hoping that their big gambles would lead to a huge payday. Below is the timeline given in Table 1 culminating in 
the discovery of a financial scandal which the media christened the ‘London Whale’. However, plenty more was wrong. The CIO's 
job at the bank, it would appear, was to hold down the bank's level of credit risk. However, the CIO office used hundreds of 
billions of dollars at its disposal to speculate, as opposed to hedging, (a greater proportion emerged from deposits, which 
the company realized was always greater than the loans they issued) and to emerge as a money-maker. 
The CIO office was engaged in a web of complex derivative trading that had little to do with hedging.
The London Whale Case Study
The London Whale in the case study had gone long and in a big way on credit default swaps, and this means that he sold 
vast amounts of protection against the credits making up the index. During a conference call with market analysts, Braunstein, 
who was the bank's CFO, admitted that the positions were meant to hedge investments the bank makes in extremely top-grade assets 
with excess deposits. He also admitted that the chief investment position was made consistent with JP Morgan Chase's overall 
risk strategy. The dynamic hedging had assumed that the markets would remain efficient. This did not work out as thought. 
The huge trade had ended up distorting the index itself, i.e. the index became cheaper than the constituents. Hedge funds 
were aware of this big position and then used skew trades to "express" this view. Hedge funds saw this difference and started
arbitrage. They waited for the divergence to die down. But the market did not correct. The London Whale was actually selling vast amounts of protection and then sitting on the positions to such an extent that the market could not autocorrect. The trade unwinds factor. In the meantime, the person engaged in selling possibly developed cold feet. As has been stated, the person possibly stopped re-balancing the hedge, and this possibly led to an increase in the risk as well as the losses. According to the bank's 10-Q form, since March 2012 the chief investment office has incurred substantial mark-to-market losses in its synthetic credit portfolio (SCP), which has proven to be riskier, more volatile and less effective as a financial hedge than the firm had previously believed. In short, JP Morgan's trade ended up distorting the market but eventually led to a massive loss due to a large unhedged exposure. The loss only goes to show how weak the legal regulatory system in place is. The CEO of JP Morgan Chase was adamant, and possibly right, in asserting that Mr. Iksil's gamble was legally compliant, despite the fact that it obviously violated the spirit of the Volcker Rule, and possibly the market regulators will listen more to the Occupier's letter because the bank has now possibly fashioned risk management.
If JP Morgan Chase cannot do it, then there is no bank that can do it. He was hedging JP Morgan's credit risk, which is not 
marked to market. If corporate credit improved, you would expect that hedge to lose money, although JP Morgan as a firm would 
net profit, because the value of its loans to corporations and other businesses would go up. However, that's not why the whale 
lost money. The whale made complex bets. The bets were large enough to move the markets in their direction, which led to high 
paper profits and low reported risk. It's like if you buy a stock and keep buying, the price goes up and you look as if you're 
making money. Hedge funds noticed the prices going up, and took the other side, pushing prices back down and creating losses. 
These losses required JP Morgan to post cash collateral. Also, the losses made the positions look much riskier to the bank's 
risk models. Senior management decided to back away from the strategy, which locked in the losses but prevented further losses. 
The basic story is a common one. The bank had more risk exposure than it wanted, in this case to European corporate credit risk. This is inherent to a bank's business; part of its job is to understand credit risk and make bets on it. But it also looks for cost-effective ways to lay off some of that risk, just like an insurance company might buy reinsurance against an unexpectedly large amount and size of claims. JP Morgan appointed a trader to lay off European credit risk in an opportunistic manner. That is, they didn't want him just to buy general protection at the going rate, but to pick and choose, buying protection when and where it was cheap, and retaining credit risk when and where protection was overpriced. The trader went further and actually sold protection when and where it was overpriced.
Of course, it's also possible that the decision would have been to cancel the strategy; or to leave the strategy under the 
existing risk managers. But even in the latter case, the CEO would not have been surprised and embarrassed when the losses 
came to light. Arguably, that did as much damage as the losses themselves. Instead of denying the losses, he would have said 
they were part of a hedging strategy under risk control, that management and the board had confidence in, and that the mark 
to market losses was acceptable in light of the long-term program goals. Instead of doing a fire-sale at the worst time, the 
program could have continued until it returned to profitability, or been closed down deliberately to minimize losses. So how 
did the risk management message get lost in translation? We think the line-level risk management was unimaginative and timid, 
but not incompetent. This is the only way to establish clear responsibility for risk. It's not a question of pointing fingers 
afterward; it's making sure authority and responsibility are in the same place. As it happens, we think authority was pushed 
down too low, and responsibility pushed up too high, so when losses mounted the people in a position to do something about it 
didn't feel responsible, and the people responsible didn't have the ability to fix it.

Conclusion :
First, there are matters of law and regulation. For example, there are regulations and some laws about accurately 
describing your business strategy and reporting/calculating risk measures if you are a public company. JPM arguably 
failed to meet such standards of corporate governance at multiple levels. Among the self-identified failures and weaknesses 
were misleadingly “favourable” valuations of various positions at different times, failure to execute and monitor a stated 
strategy, identifying a hedge that had no identifiable hedging characteristics or justification, failure to enforce 
self-identified risk limits, failure to disclose material information to regulators and investors, and other things. 
Some argue that this does not prove the need for government involvement, or alternatively that the best and only role for 
government is to enforce contracts or ensure accurate information so that market discipline will do the rest. For the case of 
JP Morgan, statements from the internal and external structures varied, leading to instances of misconduct due to the lack 
of ethical considerations by the CIO and CFO, whose actions or inactions directly led to the bank making significant losses. 
Therefore, use of the virtue ethics and instrumental ethics in any financial setting tries to settle the likelihood of future 
and current financial discrepancies due to poor controls among the senior and subordinate management. On the other hand, 
ethics might be accompanied by human imperfections, since some individuals find it hard to implement virtue ethics and put them into 
practice. This affects the wellbeing of individuals and how they handle issues at the corporate level. The importance of 
these was demonstrated fairly conclusively during the Great Depression in the US and the recent financial crisis, as well as 
a long history of bank failures.






#Case Study:Legal Aspects
Topic : A case study on Analysis under GDPR
Degree : Faculty of Computer Science and Electrical Engineering, Kiel University of Applied Science
Author : Nils Gruschka, Vasileios Mavroeidis, Kamer Vishi , Meiko Jensen
Abstract :
Big data has become a great asset for many organizations, promising improved operations and new business opportunities. However, big data has increased access to sensitive information that, when processed, can directly jeopardize the privacy of individuals and violate data protection laws. As a consequence, data controllers and data processors may face tough penalties for non-compliance that can even result in bankruptcy. In this paper, we will discuss case studies dealing with sensitive data and actions for complying with the data regulation laws. We show which types of information might become a privacy risk, the employed privacy preserving techniques in accordance with the legal requirements, and the influence of these techniques on the data processing phase and the research results.
Introduction :
The term big data describes large or complex volumes of data, both structured and unstructured that can be analysed to bring 
value. The typical definitions refer to big data by a number of V-properties, such as volume, velocity, and variety. Today, big
data has become capital, with enterprises improving substantially their operations and customer relations, and the academia 
developing and enhancing research. In addition, the huge amount, generation speed, and diversity of data require special 
architectures for storage and processing (e.g., MapReduce or Apache Hive). In order to protect individuals and their data 
a number of technical means and regulations for privacy-preserving data processing have been initiated and developed. However, 
implementing these methods in a data processing system obviously requires additional effort during the design phase, and in 
many cases such methods influence the performance of the system. As a result, in the past, enterprises and other organizations 
were not always willing to make this effort, but this tends to change due to the pressure applied by new privacy laws and 
regulations. This paper describes privacy issues in big data analysis and elaborates on two case studies in order to elucidate 
how legal privacy requirements can be met in research projects working on big data and highly sensitive personal information. 
Finally, it discusses the resulting impacts on the processing of data and the results due to the employed privacy-preserving 
techniques.
Privacy Issues in Big Data Analysis
A. Legal Regulations
From a legal point of view, in this paper we focus on the EU General Data Protection Regulation (GDPR), which came into force 
in May 2018. It is relevant to all organizations inside the European Union (EU), the European Economic Area (EEA) and also to 
organizations from other countries, if they process data of European citizens. Thus, the GDPR has effect on most major 
companies worldwide. The GDPR regulates the collection, storage, and processing of personal data. Personal data are any 
data that can be linked to a specific natural person. This includes not only direct personal identifiers (e.g., full name, 
national ID number) but also indirect identifiers like phone numbers, IP addresses, or photos with identifiable people. Data 
that do not include such identifiers are commonly regarded as anonymous and are outside the scope of GDPR. The results of big 
data analysis are very often statistical findings without direct links to specific individuals. Hence, a simple method to 
conform to all requirements of GDPR is to process only anonymous data.
A famous example of re-identification is the Netflix challenge in 2006. As part of a competition for finding more accurate 
movie recommendation methods, Netflix released a dataset containing movie ratings of 500,000 customers. In the dataset, any 
personally identifiable information (PII) was removed and only subscriber IDs (without any connection to the actual identity) 
and movie ratings (score, movie info, date) were published. However, researchers combined these data with other publicly 
available information (e.g., IMDB ratings) and were able to identify individual customers with a high probability. Other 
well-known cases include identification of individuals from internet search terms, anonymized DNA and mobility data.
Case Study : Oslo Analytics
The Operable Subjective Logic Analysis Technology for Intelligence in Cybersecurity is a research project funded under 
the ICT and Digital Innovation program of the Research Council of Norway for the University of Oslo for the period of 
2016 – 2019. Oslo Analytics develops advanced analytical methods based on big data analysis, machine learning and subjective 
logic to gain a deep situational awareness and understanding of security incidents. The project is organized in collaboration 
with national and international institutions, organizations and security vendors such as the Norwegian Computing Center (NR), 
the Norwegian National Security Authority (NSM), and the Defence Intelligence College. The Norwegian Centre for Research Data (NSD) 
is responsible for implementing the statutory data privacy requirements in the research community, and thus requires 
notification from every research project processing personal data that are not fully anonymized. Fully anonymous data are 
information that cannot in any way identify an individual either directly through name and national identity number, or 
indirectly through background variables, a name list, scrambling key, encryption formula or code.
1. Handling Sysmon Data – End Point Security
Data of particular research importance for Oslo Analytics are Sysmon logs. Sysmon is a Windows system service and device 
driver that monitors and logs system activity of Windows workstations, servers and domain controllers. Sysmon provides some 
of the most effective events needed to trace attacker activity and increase host visibility. Sysmon event class 
"Network Connection" with ID 3 can be used to identify network activity, such as connections to command and control servers 
(C&C) or even the download of encryption keys.
Like many other datasets, Sysmon contains multiple privacy-sensitive identifiers (Windows account usernames, computer 
names, static internal IPs) and user-behaviour (running processes, internet activity) that Oslo Analytics has to deal with 
prior to processing.
2. Data Storage and Accessibility
The data are stored on a secure server with access restricted to authorized researchers working on Oslo Analytics 
under a very tight access control list adopting the principle of least privilege. Processing of the data can only 
occur on the server. Access to the secure server is only allowed from inside the organizational network and this 
is restricted to specific computers filtered by their MAC addresses, their internal static IP, and user account. 
In addition, a firewall has been configured to allow only incoming connections to the server on port 22 (SSH). 
Any other network activity is denied and consequently dropped. In this respect, the network restrictions 
prevented us from personally installing any extra programming libraries needed for processing the data after 
setting up the server. Thus, we had to inform the security team that is responsible for the security of the server 
and the data stored. Finally, the user accounts for processing the data on the secure server are only valid for the 
duration of the project (account expiration), meaning that the accounts will be disabled on a specific date. The same 
principle applies to the Sysmon data which restricts the duration of the data storage to the active period of the project.
Conclusion :
This paper presented the implications of data protection laws on projects dealing with big data, and by using case 
studies analysed how privacy-preserving techniques can be applied. The results were quite different. To mitigate 
privacy concerns regarding biometric data collection and processing, the participants were asked to give consent. 
In addition, no problems were faced during the data analysis phase. Data from an existing data source were used. 
Here, anonymization of many data fields was required, making the data analysis more challenging and in many cases limited. 
It is of great importance to remark that for projects and technologies dealing with sensitive data a data protection impact 
assessment should be conducted at the very early stages of the project to identify potential privacy challenges, and to 
adapt the analysis methods taking into consideration privacy-preserving techniques.








#Case Study: Emerging Technology
Topic : Big Data Emerging Technologies : A Case Study with Analyzing Twitter Data Using Apache Hive
Faculty : Computer Science & Engineering Department
Author : National Institute of Technical Teachers Training and Research, Chandigarh, India 
Abstract :
These are the days of growth and innovation for a better future. Nowadays companies are bound to realize the need for Big Data to make decisions on complex problems. Big Data is a term that refers to a collection of large datasets containing massive amounts of data whose size is in the range of petabytes or zettabytes, or that have a high rate of growth and complexity, making them difficult to process and analyze using conventional database technologies. Big Data is generated from various sources such as social networking sites like Facebook, Twitter, etc., and the data that is generated can be in various formats: structured, semi-structured, or unstructured. To extract valuable information from this huge amount of data, new tools and techniques are the need of the hour for organizations to derive business benefits and to gain a competitive advantage in the market. This paper presents a comprehensive study of major emerging Big Data technologies, highlighting their important features and how they work, along with a comparative study between them. This paper also presents a performance analysis of an Apache Hive query executed on Twitter tweets in order to calculate the MapReduce CPU time spent and the total time taken to finish the job.
Introduction :
The digital universe is flooded with a large amount of data generated by users worldwide. These data are diverse in 
nature, come from various sources and in many forms. Every time we use Internet, send an email, make a phone call, or pay a 
bill, we create data. All this data needs to be stored in huge data chunks. These data chunks are stored in thousands of disks 
or hard drives. Around 2.72 zettabytes of data were created until 2012 and it is expected to double every two years reaching 
about 8 zettabytes at the end of 2015. Multimedia industries and the increased use of social networking sites are the major sources 
of Big Data generation. Every minute, Facebook users share nearly 3.3 million pieces of content, Twitter users send 347,222 
tweets, 100 hours of video are uploaded to YouTube, and 4.1 million search queries are executed on Google. So, 
in order to get business value from this large amount of data generated Hadoop and its Ecosystems are the popular solution 
which can help out for better Big Data Analytics solution. This paper is organized as follows. An overview of BigData and 
Hadoop is presented. Hadoop Ecosystem with analyzing Twitter data by using Apache Hive configured on Microsoft HDInsight 
Hadoop cluster is discussed. 
Hadoop For Big Data Processing
The main challenge in front of IT world is to store and analyze huge quantities of data. Every single day data is generated 
in huge amount from various fields like Geography, Engineering, and Economics & Science etc. To analyze such huge amounts 
of data for better understanding of users there is a need to develop data intensive applications which are highly available, 
highly scalable and based on a reliable storage system. To cope with these requirements, in 2003 Google developed a 
Distributed File System (DFS) called the Google File System (GFS) and introduced the MapReduce programming model to achieve high 
performance by moving tasks to the nodes where the data is stored and by executing them in parallel. GFS was a great 
discovery in order to handle massive data for storing, retrieving, processing and analyzing. But, the major issue with 
GFS was that this file system was proprietary, so the researcher team of Yahoo developed an open source implementation of 
GFS and Map-Reduce and later this open-source project was named as Apache Hadoop. Hadoop was created by Doug Cutting, an 
employee at Yahoo, for the Nutch search engine project. Inspired by his son’s toy elephant, Doug named it Hadoop, with a yellow 
elephant as its symbol. The Hadoop architecture mainly comprises two main components: HDFS for storing Big Data and MapReduce 
for Big Data analytics.
HDFS Architecture :
Hadoop Distributed File System(HDFS) is a file system which is used for storing large datasets in a default block of size 
64 MB in distributed manner on Hadoop cluster. Hadoop cluster means running a set of daemons on different servers of the 
network.
MapReduce Architecture :
a. Map Stage
Map is a function that splits up the input text; the map function is written in such a way that multiple map jobs can be 
executed at once. Map is the part of the program that divides up the tasks. This function takes key/value pairs as input 
and generates an intermediate set of key/value pairs.
b. Reduce Stage
Reduce is a function that receives the mapped work and produces the final result. The working of Reduce function 
depends upon merging of all intermediate values associated with the same intermediate key for producing the final result.
Case Study for Analysing Twitter Data
Experimental Setup
Social websites like Twitter are a useful source of Big-Data for analyzing and understanding users trends. 
The main objective of this work is to fetch and analyze Twitter tweets which are stored as JavaScript Object Notation 
(JSON) format on a cloud-based Apache Hive solution. To implement this work we used Microsoft Azure cloud 
services. Two Infrastructure-as-a-Service (IaaS) services were used: one is the HDInsight Hadoop solution and the other is 
Blob Storage. First, live Twitter data were fetched using the Twitter Streaming API; the fetched raw data were then stored in 
Blob Storage and afterwards transferred into a Hive table. HDInsight clusters of various node counts were then created, on which 
Apache Hive queries were executed to analyze the Twitter tweets.
Hive query was executed on the data stored in Hive table and the results of a number of tweets count were calculated. 
The results of the HDInsight cluster for running the Hive query were analyzed based on two parameters: the first is the Total 
MapReduce CPU Time Spent for running the Hive query and the second is the Total Time taken for running the job. HDInsight Hadoop 
clusters of size 1 node, 2 nodes, 4 nodes, and 6 nodes were used. The MapReduce CPU time spent on the Hadoop 
cluster was measured in seconds when the Hive query was executed for fetching and predicting the tweet count. 
It is observed that as the number of nodes in the HDInsight cluster increases, the MapReduce slot time for executing the 
Hive query increases, because more nodes in the cluster means more switching of mapper and reducer functions 
on the cluster nodes. It is also observed that as the number of nodes in the HDInsight cluster increases, the total time taken 
to execute the Hive query decreases, because with more nodes in the HDInsight cluster the processing of the 
Hive query can take place in parallel, which decreases the query execution time.
Conclusion :
Big Data analysis is the latest area of interest for the research communities around the globe. Big Data refers to 
the volume of data beyond the traditional database technology capacity to store, access, manage and compute efficiently. 
By analyzing this large amount of data, companies can predict customer behavior, improve marketing strategy, and get 
competitive advantages in the market. Hadoop is a flexible and open source implementation for analyzing large datasets 
using MapReduce. There are various emerging technologies such as Apache Pig, Hive, Sqoop, HBase, Zookeeper, and Flume 
that can be used to improve the performance of the basic Hadoop MapReduce framework. Apache Pig is a scripting language 
that can be used to reduce the development time of MapReduce programs because it requires fewer lines of code and 
provides nested data types that are missing from MapReduce. Hive provides easy to use platform for the developers who 
are comfortable with the SQL language for MapReduce programming. HDFS lacks random read/write access to Big Data, which 
can be provided by HBase. If we want to transfer data between Hadoop and an RDBMS, Sqoop can be used. Zookeeper can be 
used for synchronization of the Hadoop cluster, and finally Flume can be used for moving streaming web log data to HDFS. 
This paper also discussed fetching and analyzing Twitter tweets using a Hive query on an HDInsight cluster; the results 
show that as we increase the number of nodes in the cluster, the total time taken to execute the Hive query decreases, 
while the MapReduce CPU time spent increases.
Case Study for Analysing Twitter Data
Experimental Setup
Social websites like Twitter are a useful source of Big-Data for analyzing and understanding users trends. 
The main objective of this work is to fetch and analyze Twitter tweets which are stored in JavaScript Object Notation (JSON) 
format on a cloud-based Apache Hive solution. To implement this work we used Microsoft Azure cloud services. 
Two Infrastructure-as-a-Service (IaaS) services were used: one is the HDInsight Hadoop solution and the other is Blob Storage. 
First, live Twitter data were fetched using the Twitter Streaming API; the fetched raw data were then stored in Blob Storage 
and afterwards transferred into a Hive table. HDInsight clusters of various node counts were then created, on which Apache Hive 
queries were executed to analyze the Twitter tweets.
A Hive query was executed on the data stored in the Hive table and the resulting tweet counts were calculated. The results of the HDInsight cluster for running the Hive query were analyzed based on two parameters: the first is the Total MapReduce CPU Time Spent for running the Hive query and the second is the Total Time taken for running the job. HDInsight Hadoop clusters of size 1 node, 2 nodes, 4 nodes, and 6 nodes were used. The MapReduce CPU time spent on the Hadoop cluster was measured in seconds when the Hive query was executed for fetching and predicting the tweet count. It is observed that as the number of nodes in the HDInsight cluster increases, the MapReduce slot time for executing the Hive query increases, because more nodes in the cluster means more switching of mapper and reducer functions on the cluster nodes. It is also observed that as the number of nodes in the HDInsight cluster increases, the total time taken to execute the Hive query decreases, because with more nodes in the HDInsight cluster the processing of the Hive query can take place in parallel, which decreases the query execution time.
Conclusion :
Big Data analysis is the latest area of interest for the research communities around the globe. 
Big Data refers to the volume of data beyond the traditional database technology capacity to store, access, 
manage and compute efficiently. By analyzing this large amount of data, companies can predict customer behavior, 
improve marketing strategy, and get competitive advantages in the market. Hadoop is a flexible and open source 
implementation for analyzing large datasets using MapReduce. There are various emerging technologies such as 
Apache Pig, Hive, Sqoop, HBase, Zookeeper, and Flume that can be used to improve the performance of basic Hadoop 
MapReduce framework. Apache Pig is a scripting language that can be used to reduce the development time of MapReduce 
programs because it requires fewer lines of code and provides nested data types that are missing from MapReduce. 
Hive provides an easy-to-use platform for developers who are comfortable with the SQL language for MapReduce programming. 
HDFS lacks random read/write access to Big Data, which can be provided by HBase. If we want to transfer data between 
Hadoop and an RDBMS, Sqoop can be used. Zookeeper can be used for synchronization of the Hadoop cluster, and finally Flume 
can be used for moving streaming web log data to HDFS. This paper also discussed fetching and analyzing Twitter tweets 
using a Hive query on an HDInsight cluster; the results show that as we increase the number of nodes in the cluster, 
the total time taken to execute the Hive query decreases, while the MapReduce CPU time spent increases.



